def task2b():
    """Print the top-3 SVD semantics of the co-actor matrix and a 3-way grouping.

    Initialises the DataHandler state, runs
    ``decompositions.SVDDecomposition(<co-actor similarity frame>, 3)``,
    pretty-prints every row of the third returned factor, and finally prints
    the partition that ``formatter.splitGroup`` builds from the first factor.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    similarity_df, _actor_list = DataHandler.coactor_siilarity_matrix()
    left_factor, _sigma, right_factor = decompositions.SVDDecomposition(
        similarity_df, 3)
    semantic_rows = np.matrix(right_factor).tolist()
    actor_info = DataHandler.actor_info_df
    actor_ids = list(similarity_df.index)

    print("Top 3 semantics are:")
    for row in semantic_rows:
        prettyPrintActorVector(row, actor_ids, actor_info)
        print("")

    membership = formatter.splitGroup(left_factor, 3)
    print("The three groupings are:")
    partitions = tasksBusiness.get_partition_on_ids(membership,
                                                    actor_info['name'])
    for group_no, members in partitions.items():
        # print() joins its arguments with one space: "Group <n> : <members>".
        print('Group', group_no + 1, ':', members)
        print(" ")
    print()
def task1a_PCA(userId):
    """Print movies recommended for ``userId`` from a 5-component PCA space.

    The movie-tag frame is reduced with
    ``decompositions.PCADimensionReduction(..., 5)``, a movie-movie similarity
    is built from the reduced frame, and ``getWeightedSimilarityOrder``
    supplies the result (element 0 is read as movie ids, element 1 as the
    matching scores).

    Args:
        userId: key into ``DataHandler.user_rated_or_tagged_date_map``.

    Raises:
        KeyError: if ``userId`` has no entry in the user/date map.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movie_tag_df = DataHandler.load_movie_tag_df()
    # Assuming the number of latent semantics is 5.
    reduced = decompositions.PCADimensionReduction(movie_tag_df, 5)
    decomposed = pd.DataFrame(reduced, index=movie_tag_df.index)
    similarity_df = DataHandler.movie_movie_Similarity1(decomposed)
    movie_list = getWeightedSimilarityOrder(similarity_df, userId)

    # The original sorted a temporary copy and threw the result away; keep the
    # sorted sequence so the seed movies are listed in timestamp order.
    # Each entry is indexed as (movie id, timestamp).
    watched_entries = sorted(DataHandler.user_rated_or_tagged_date_map[userId],
                             key=lambda tup: tup[1])
    watched_movie_ids = [entry[0] for entry in watched_entries]

    movieid_name_map = DataHandler.movieid_name_map
    print('Movies similar to the following seed movies: ' +
          str([movieid_name_map.get(i) for i in watched_movie_ids]))

    # Materialise the scores once instead of once per printed row.
    scores = list(movie_list[1])
    for i in range(len(movie_list[0])):
        print(movieid_name_map[movie_list[0][i]] + ': ' + str(scores[i]))
def runme():
    """Run the combined recommender for the hard-coded user 36, then loop on feedback.

    After the initial recommendation is printed, the user is prompted for a
    comma-separated list of 1/0 relevance flags until ``exit`` is typed.
    Input that is not a list of integers is rejected with a message and the
    prompt is repeated, instead of aborting the session with ``ValueError``.
    """
    global q_vector
    enter_userid = 36  # input("UserID : ")
    userId = int(enter_userid)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    # Read the name map only after the dictionaries have been created.
    movieid_name_map = DataHandler.movieid_name_map
    loadBase(userId)
    # runDecomposition(loadPCASemantics)
    distances = runAllMethods()
    reco = [nonwatchedList[i] for i in distances][0:5]
    runAllMethodrelevancefeedback(reco, [1, 1, 1, 0, 0])
    new_query = q_vector
    movies = recommendMovies(new_query)
    named_movies = [movieid_name_map[i] for i in movies]
    print('Top 5 movies : ' + str(named_movies))

    while True:
        feedback = input("Relevance (1/0) for each of the 5 movies: ")
        if feedback == 'exit':
            print("GoodBye........")
            break
        try:
            relevance = [int(flag) for flag in feedback.split(',')]
        except ValueError:
            print("Invalid Input provided. Please try again.")
            continue
        new_query = newQueryFromFeedBack(movies, relevance)
        print([movieid_name_map[nonwatchedList[i]] for i in new_query][0:5])
def task1a_svd(genre):
    """Print the 4 SVD latent semantics (over tags) for the movies of ``genre``.

    Prints a message and returns early when ``genre`` is not a key of
    ``DataHandler.getGenreMoviesMap()``. Otherwise the movie-tag rows of the
    genre's movies (rows with any NaN dropped) are decomposed with
    ``decompositions.SVDDecomposition(..., 4)`` and each row of the third
    returned factor is pretty-printed.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    movies_by_genre = DataHandler.getGenreMoviesMap()
    if genre not in movies_by_genre.keys():
        print("genre " + genre + " not present in data")
        return

    movie_tag_df = DataHandler.load_movie_tag_df()
    tag_info = DataHandler.tag_id_df
    # Index of the transposed frame == the frame's column labels.
    tag_ids = list(movie_tag_df.transpose().index)
    genre_rows = movie_tag_df.loc[movies_by_genre.get(genre)]
    genre_rows = genre_rows.dropna(how='any')

    _u, _sigma, semantics = decompositions.SVDDecomposition(genre_rows, 4)

    print("The 4 semantics for genre:" + genre + " are")
    rows = np.matrix(semantics).tolist()
    for number, row in enumerate(rows, start=1):
        print("semantic " + str(number) + ": ")
        prettyPrintTagVector(row, tag_ids, tag_info)
        print("")
    return
def PersnalizedPageRank_top5SimilarMovies1(userMovies):
    """Print movies ranked by personalised PageRank seeded with ``userMovies``.

    Movies are embedded with the second factor of a rank-5 CP decomposition,
    a movie-movie similarity is derived from it, and
    ``ppr.personalizedPageRank`` scores every movie. The 15 highest-scoring
    rows are printed as ``<name> <score> <genres>``, skipping rows whose name
    equals a seed movie's name.
    """
    DataHandler.createDictionaries1()

    tensor = DataHandler.getTensor_ActorMovieGenreYearRankRating()
    factors = decompositions.CPDecomposition(tensor, 5)
    sorted_ids = sorted(list(DataHandler.movie_actor_map.keys()))
    latent_movies = pd.DataFrame(factors[1], index=sorted_ids)
    similarity = DataHandler.movie_movie_Similarity1(latent_movies)

    names = DataHandler.movieid_name_map
    scores = ppr.personalizedPageRank(similarity, userMovies, constants.ALPHA)

    movies = list(similarity.index)
    name_column = pd.DataFrame(pd.Series(movies), columns=['movies'])
    name_column['movies'] = name_column['movies'].map(lambda x: names.get(x))
    combined = pd.concat([scores, name_column], axis=1)
    top_rows = combined.sort_values(by=0, ascending=False).head(15)

    seed_names = [names.get(i) for i in userMovies]
    print('Movies similar to the following seed movies: ' + str(seed_names))

    genres = DataHandler.movie_genre_map
    seed_genres = [genres.get(i) for i in userMovies]
    print('Genres for seed movies: ' + str(seed_genres))

    for row_label in top_rows.index:
        title = top_rows.loc[row_label, 'movies']
        if title in seed_names:
            continue
        print(title + ' ' + str(top_rows.loc[row_label, 0]) + ' ' +
              str(genres.get(movies[row_label])))
def task1_2Decompostions(func, userid):
    """Recommend 5 movies through ``rf.runDecomposition(func)`` and loop on feedback.

    Args:
        func: passed unchanged to ``rf.runDecomposition``.
        userid: user identifier; converted with ``int()`` before use.

    After printing the initial (name, distance) pairs the user is prompted
    for comma-separated 1/0 relevance flags until ``exit`` is typed. Input
    that does not parse as integers is rejected with a message and the
    prompt repeats, rather than ending the session with ``ValueError``.
    """
    userId = int(userid)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    # Read the name map only after the dictionaries have been created.
    movieid_name_map = DataHandler.movieid_name_map
    rf.loadBase(userId)
    rf.runDecomposition(func)
    new_query = rf.q_vector
    movies, distances = rf.recommendMovies(new_query)
    named_movies = [movieid_name_map[i] for i in movies]
    print('---------------------')
    print('Top 5 movies : ')
    print(str(list(zip(named_movies, distances))))
    print("---------------------")

    while True:
        feedback = input("Relevance (1/0) for each of the 5 movies: ")
        if feedback == 'exit':
            print("Exit........")
            break
        try:
            relevance = [int(flag) for flag in feedback.split(',')]
        except ValueError:
            print("Invalid Input provided. Please try again.")
            continue
        new_query, _weights = rf.newQueryFromFeedBack(movies, relevance)
        print([movieid_name_map[rf.nonwatchedList[i]]
               for i in new_query][0:5])
def task1_2CombinedPredictor(userid):
    """Recommend 5 movies with ``rf.runAllMethods`` and loop on relevance feedback.

    Args:
        userid: user identifier; converted with ``int()`` and that integer is
            used for every downstream call (the original passed the raw
            argument to ``rf.runAllMethods`` only).

    Prints the user's watched movies ordered by the second element of each
    history entry, then the top-5 (name, similarity) pairs, then prompts for
    comma-separated 1/0 flags until ``exit``. Unparseable feedback is
    rejected and the prompt repeats.
    """
    userId = int(userid)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    # Read the name map only after the dictionaries have been created.
    movieid_name_map = DataHandler.movieid_name_map
    rf.loadBase(userId)
    similarities, sortedSimilarity = rf.runAllMethods(userId)
    movies = [rf.nonwatchedList[i] for i in similarities][0:5]

    history = DataHandler.user_rated_or_tagged_date_map.get(userId)
    if history is None:
        history = []
    # Entries are indexed as (movie id, timestamp); an empty history now
    # yields an empty list instead of an IndexError.
    watched_ids = [entry[0] for entry in sorted(history, key=itemgetter(1))]
    watchedMovieNames = [movieid_name_map[movieid] for movieid in watched_ids]

    print('-------------------------------------')
    print('Movies Watched by the user in order: ' + str(watchedMovieNames))
    named_movies = [movieid_name_map[i] for i in movies]
    print('Top 5 movies : ' + str(list(zip(named_movies, sortedSimilarity))))
    print('-------------------------------------')

    while True:
        feedback = input("Relevance (1/0) for each of the 5 movies: ")
        if feedback == 'exit':
            print("Exit........")
            break
        try:
            relevance = [int(flag) for flag in feedback.split(',')]
        except ValueError:
            print("Invalid Input provided. Please try again.")
            continue
        new_query = rf.runAllMethodrelevancefeedback(movies, relevance)
        print([movieid_name_map[rf.nonwatchedList[i]]
               for i in new_query][0:5])
def genre_spaceActors_LDA_tf(genre):
    """Print 4 LDA topics over actors for ``genre``, terms sorted by weight.

    Prints a message and returns when ``genre`` is not a key of
    ``DataHandler.genre_movie_map``. Otherwise runs
    ``decompositions.LDADecomposition(<genre/actor tf frame>, 4, ...)`` and,
    per topic, prints the (actor name, weight) pairs in descending weight.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    # Return values are not used below; the 4-way unpack is kept as written.
    _movie_tags, _tag_ids, _actor_ranks, _movie_ranks = DataHandler.get_dicts()
    DataHandler.create_actor_actorid_map()
    actor_names = DataHandler.actor_actorid_map
    genre_actor_df = DataHandler.load_genre_actor_matrix_tf(genre)

    if genre not in list(DataHandler.genre_movie_map.keys()):
        print("genre " + genre + " not in data")
        return

    lda_model, _doc_term_matrix, id_to_term = decompositions.LDADecomposition(
        genre_actor_df, 4, constants.genreActorSpacePasses)

    terms_by_topic = defaultdict(set)
    for topic in range(4):
        # get_topic_terms returns only the top n terms (default 10), so ask
        # for as many as there are actors.
        pairs = lda_model.get_topic_terms(topic, topn=len(actor_names))
        for pair in pairs:
            actor_key = id_to_term.get(pair[0])
            terms_by_topic[topic].add((actor_names.get(actor_key), pair[1]))

    for topic in range(4):
        ranked = sorted(terms_by_topic.get(topic),
                        key=itemgetter(1),
                        reverse=True)
        # print() inserts the single space between the label and the list.
        print('Semantic ' + str(topic + 1), ranked)
        print('\n')
def task5_3():
    """Train ``binarySVM.BinarySVM`` on the labelled movies and print predictions.

    Training labels are encoded as integer codes (their position in
    ``uniqueLabels``) before fitting; predictions are decoded back through
    the same list and printed as (movie name, label) pairs.
    """
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    # Result unused; the call is retained in case the loader primes
    # DataHandler state — TODO confirm and drop.
    DataHandler.load_movie_tag_df()

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load feature files produced by this project.
    with open(constants.DIRECTORY + "movie_feature_df2", "rb") as feature_file:
        allMovieData = pickle.load(feature_file)

    (train_movies_Matrix, train_label, train_movieids, test_movies_Matrix,
     test_movieids) = createTrainTestData(allMovieData)

    uniqueLabels = list(set(train_label))
    # Encode through a lookup built up front. The original rewrote
    # train_label in place while still scanning it, so an integer label equal
    # to an already-assigned code could be remapped a second time.
    label_to_code = {label: code for code, label in enumerate(uniqueLabels)}
    encoded_labels = [label_to_code[label] for label in train_label]

    svmModel = binarySVM.BinarySVM()
    svmModel.fit(train_movies_Matrix, encoded_labels)

    # np.asarray(...).item() is the replacement for the removed np.asscalar.
    predictions = [
        uniqueLabels[int(np.asarray(code).item())]
        for code in svmModel.predict(test_movies_Matrix)
    ]
    test_movieids_names = [movieid_name_map[mid] for mid in test_movieids]
    print("Results for SVM classifier as (Movie Name, Label): \n" +
          str(list(zip(test_movieids_names, predictions))) + "\n")
def task1_2PageRank():
    """Prompt for a user id and run ``rf.task1d`` for that user.

    The typed value is converted with ``int()`` after ``DataHandler.vectors()``
    has run; a non-integer entry raises ``ValueError`` at that point.
    """
    typed_id = input("UserID : ")
    DataHandler.vectors()
    user_id = int(typed_id)
    DataHandler.createDictionaries1()
    rf.loadBase(user_id)
    rf.task1d(user_id)
def task3a(seed):
    """Validate the seed actor ids, then run the top-10 similar-actors PageRank.

    Prints a message and returns at the first id that is not a key of
    ``DataHandler.actor_movie_rank_map``. Otherwise delegates to
    ``tasksBusiness.PersnalizedPageRank_top10_SimilarActors(seed)``.
    """
    DataHandler.createDictionaries1()
    known_actors = DataHandler.actor_movie_rank_map

    # A private sentinel keeps a literal None seed distinguishable from
    # "every seed is valid".
    all_valid = object()
    first_unknown = next(
        (actor for actor in seed if actor not in known_actors), all_valid)
    if first_unknown is not all_valid:
        print('Invalid seed actor id : ' + str(first_unknown))
        return

    tasksBusiness.PersnalizedPageRank_top10_SimilarActors(seed)
def top10_Actors_LDA_tf(givenActor):
    """Print the actors ``DataHandler.similarActors_LDA_tf`` returns for ``givenActor``.

    Prints a message and returns when ``givenActor`` is not a key of
    ``DataHandler.actor_movie_rank_map``. Each result is printed as
    ``<actor name> <similarity>``.
    """
    DataHandler.createDictionaries1()
    if givenActor not in DataHandler.actor_movie_rank_map:
        print('Invalid seed actor id : ' + str(givenActor))
        return

    DataHandler.create_actor_actorid_map()
    similar_actors = DataHandler.similarActors_LDA_tf(givenActor)
    names = DataHandler.actor_actorid_map

    print('Actors similar to ' + str(names[givenActor]))
    for actor_id, similarity in similar_actors:
        print(names[actor_id] + ' ' + str(similarity))
    return
def top5SimilarMovies1(userMovies):
    """Print the movies ``pagerank.PPR`` ranks for the seeds in ``userMovies``.

    Movies are embedded with the second factor of a rank-5 CP decomposition
    and a movie-movie similarity is derived from it. Every
    (column position, score) pair yielded by ``pagerank.PPR`` is printed as
    ``<name> <score>`` unless the movie at that column is itself a seed.
    """
    DataHandler.createDictionaries1()

    tensor = DataHandler.getTensor_ActorMovieGenreYearRankRating()
    factors = decompositions.CPDecomposition(tensor, 5)
    sorted_ids = sorted(list(DataHandler.movie_actor_map.keys()))
    latent_movies = pd.DataFrame(factors[1], index=sorted_ids)
    similarity = DataHandler.movie_movie_Similarity1(latent_movies)

    names = DataHandler.movieid_name_map
    ranked = pagerank.PPR(similarity, userMovies, constants.ALPHA)

    print('Movies similar to the following seed movies: ' +
          str([names.get(i) for i in userMovies]))
    for position, score in ranked:
        candidate = similarity.columns[position]
        if candidate in userMovies:
            continue
        print(names.get(candidate) + ' ' + str(score))
def task1dImplementation_SVD(movie_id):
    """Print the 10 actors closest to ``movie_id`` in a 5-component SVD space.

    The actor-tag frame is decomposed with
    ``decompositions.SVDDecomposition(..., 5)``. The movie's tag row is
    projected through the transposed third factor and compared, via
    ``metrics.l2Norm``, with each actor's row of the first factor. Actors
    listed for the movie in ``DataHandler.movie_actor_map`` are skipped.
    Results are sorted ascending by score.

    Args:
        movie_id: a label of the movie-tag frame's index; a message is
            printed and the function returns if it is absent.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    moviesIndexList = list(movie_tag_df.index)
    actorsIndexList = list(actor_tag_df.index)

    if movie_id not in moviesIndexList:
        # str(): .get() yields None for an id missing from the name map, and
        # concatenating None raised TypeError on exactly this path.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorU, _actorSigma, actorV = decompositions.SVDDecomposition(
        actor_tag_df, 5)
    tagsToActorSemantics = np.matrix(actorV).transpose()
    # .values works on pandas versions where as_matrix() has been removed.
    movieTagMatrix = np.matrix(movie_tag_df.values)
    movieInTags = movieTagMatrix[moviesIndexList.index(movie_id)]
    movieInActorSemantics = (movieInTags * tagsToActorSemantics).tolist()[0]
    actorsInSemantics = np.matrix(actorU)

    DataHandler.create_actor_actorid_map()
    actorsForMovie = DataHandler.movie_actor_map.get(movie_id)
    if actorsForMovie is None:
        # No cast recorded for the movie: nothing to exclude.
        actorsForMovie = ()

    actorsWithScores = []
    for index, actor_id in enumerate(actorsIndexList):
        if actor_id in actorsForMovie:
            continue
        actor = actorsInSemantics[index].tolist()[0]
        actorName = DataHandler.actor_actorid_map.get(actor_id)
        similarityScore = metrics.l2Norm(actor, movieInActorSemantics)
        actorsWithScores.append((similarityScore, actorName))

    resultActors = sorted(actorsWithScores, key=operator.itemgetter(0))
    print("10 Actors similar to movie " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in resultActors[0:10]:
        print(str(name) + " : " + str(score))
    return
def task1d_pca(movie_id):
    """Print the 10 actors closest to ``movie_id`` in a 5-component PCA space.

    ``decompositions.PCADecomposition(actor_tag_df, 5)`` supplies the
    semantics; both the movie's tag row and every actor's tag row are
    multiplied by their transpose and compared with ``metrics.l2Norm``.
    Actors listed for the movie in ``DataHandler.movie_actor_map`` are
    skipped. Results are sorted ascending by score.

    Args:
        movie_id: a label of the movie-tag frame's index; a message is
            printed and the function returns if it is absent.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    # .values works on pandas versions where as_matrix() has been removed.
    actorTagMatrix = np.matrix(actor_tag_df.values)
    movieTagMatrix = np.matrix(movie_tag_df.values)
    actorIndexList = list(actor_tag_df.index)
    movieIndexList = list(movie_tag_df.index)

    if movie_id not in movieIndexList:
        # str(): .get() yields None for an id missing from the name map, and
        # concatenating None raised TypeError on exactly this path.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorSemantics = decompositions.PCADecomposition(actor_tag_df, 5)
    actorP = np.matrix(actorSemantics).transpose()
    movieInTags = movieTagMatrix[movieIndexList.index(movie_id)]
    movieInActorSemantics = (movieInTags * actorP).tolist()[0]
    actorsInActorSemantics = (actorTagMatrix * actorP).tolist()

    # Called once; the original invoked it twice back to back.
    DataHandler.create_actor_actorid_map()
    actorsForMovie = DataHandler.movie_actor_map.get(movie_id)
    if actorsForMovie is None:
        # No cast recorded for the movie: nothing to exclude.
        actorsForMovie = ()

    simAndActor = []
    for actorId, actorInSemantics in zip(actorIndexList,
                                         actorsInActorSemantics):
        if actorId in actorsForMovie:
            continue
        actorName = DataHandler.actor_actorid_map.get(actorId)
        score = metrics.l2Norm(actorInSemantics, movieInActorSemantics)
        simAndActor.append((score, actorName))

    result = sorted(simAndActor, key=operator.itemgetter(0))
    print("Top 10 actors similar to movie: " +
          str(movieid_name_map.get(movie_id)) + " are: ")
    for score, name in result[0:10]:
        print(str(name) + " : " + str(score))
    return
def Recommender(userId):
    """Return (movie name, score) pairs ranked by weighted personalised PageRank.

    Movies are embedded with the second factor of a rank-5 CP decomposition;
    ``ppr.personalizedPageRankWeighted`` is seeded with
    ``DataHandler.userMovieRatings(userId)`` (iterated as (movie id, value)
    pairs) using 0.9 as its third argument.

    Args:
        userId: passed to ``DataHandler.userMovieRatings``.

    Returns:
        list of ``(movie name, score)`` tuples in descending score order,
        excluding the user's seed movies.
    """
    DataHandler.createDictionaries1()
    movieRatedSeed = DataHandler.userMovieRatings(userId)
    decomposed = decompositions.CPDecomposition(
        DataHandler.getTensor_ActorMovieGenre(), 5)
    moviesList = sorted(DataHandler.movie_actor_rank_map.keys())
    movie_movie_similarity = DataHandler.movie_movie_Similarity1(
        pd.DataFrame(decomposed[1], index=moviesList))
    prData = ppr.personalizedPageRankWeighted(movie_movie_similarity,
                                              movieRatedSeed, 0.9)

    # itertuples() rows are (index, first column, ...): the index selects the
    # movie id, the first column is the score.
    rankedItems = sorted(
        ((moviesList[row[0]], row[1]) for row in prData.itertuples()),
        key=lambda item: item[1],
        reverse=True)

    movieid_name_map = DataHandler.movieid_name_map
    seed_ids = [movie_id for movie_id, _rating in movieRatedSeed]
    seedmovieNames = [movieid_name_map[movie_id] for movie_id in seed_ids]
    print("Movies similar to the users seed movies " + str(seedmovieNames) +
          " are:")

    # Build the exclusion set once; the original rebuilt the seed-id list for
    # every ranked movie.
    seed_set = set(seed_ids)
    return [(movieid_name_map[movie_id], score)
            for movie_id, score in rankedItems if movie_id not in seed_set]
def PersnalizedPageRank_top10_SimilarCoActors(seed):
    """Print co-actors ranked by personalised PageRank seeded with ``seed``.

    ``ppr.personalizedPageRank`` is run on the co-actor similarity frame with
    ``constants.ALPHA``. The 15 highest-scoring rows are printed as
    ``<actor name> <score>``, skipping rows whose name equals a seed actor's
    name.
    """
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    coactor_df, _unused = DataHandler.coactor_siilarity_matrix()
    names = DataHandler.actor_actorid_map

    scores = ppr.personalizedPageRank(coactor_df, seed, constants.ALPHA)

    actor_ids = list(coactor_df.index)
    name_column = pd.DataFrame(pd.Series(actor_ids), columns=['Actor'])
    name_column['Actor'] = name_column['Actor'].map(lambda x: names.get(x))
    combined = pd.concat([scores, name_column], axis=1)
    top_rows = combined.sort_values(by=0, ascending=False).head(15)

    seed_names = [names.get(i) for i in seed]
    print('Co Actors similar to the following seed actors: ' +
          str(seed_names))
    for row_label in top_rows.index:
        actor_name = top_rows.loc[row_label, 'Actor']
        if actor_name in seed_names:
            continue
        print(actor_name + ' ' + str(top_rows.loc[row_label, 0]))
def task1d_tfidf(movie_id):
    """Print the 10 actors whose tag vectors are closest to ``movie_id``'s.

    The movie's row of the movie-tag frame is compared with every row of the
    actor-tag frame using ``metrics.l2Norm``. Actors listed for the movie in
    ``DataHandler.movie_actor_map`` are skipped. Results are sorted ascending
    by score.

    Args:
        movie_id: a label of the movie-tag frame's index; a message is
            printed and the function returns if it is absent.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    actorTagDataframe = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map
    # .values works on pandas versions where as_matrix() has been removed.
    actorsTags = np.matrix(actorTagDataframe.values).tolist()
    actorIndexList = list(actorTagDataframe.index)
    movieIndexList = list(movie_tag_df.index)
    movieTagMatrix = np.matrix(movie_tag_df.values)

    if movie_id not in movieIndexList:
        # str(): .get() yields None for an id missing from the name map, and
        # concatenating None raised TypeError on exactly this path.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorsForMovie = DataHandler.movie_actor_map.get(movie_id)
    if actorsForMovie is None:
        # No cast recorded for the movie: nothing to exclude.
        actorsForMovie = ()
    movieInTags = movieTagMatrix[movieIndexList.index(movie_id)].tolist()[0]

    DataHandler.create_actor_actorid_map()
    simAndActor = []
    for actorId, actorinTags in zip(actorIndexList, actorsTags):
        if actorId in actorsForMovie:
            continue
        actorName = DataHandler.actor_actorid_map.get(actorId)
        comparisonScore = metrics.l2Norm(movieInTags, actorinTags)
        simAndActor.append((comparisonScore, actorName))

    result = sorted(simAndActor, key=operator.itemgetter(0))
    print("Top 10 actors similar to " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in result[0:10]:
        print(str(name) + " : " + str(score))
    return
def task1b_svd(genre):
    """Print the 4 SVD latent semantics (over actors) for ``genre``.

    Prints a message and returns when ``genre`` is not a key of
    ``DataHandler.genre_movie_map``. Otherwise the genre/actor frame is
    decomposed with ``decompositions.SVDDecomposition(..., 4)`` and each row
    of the third returned factor is pretty-printed against the frame's
    column labels.

    NOTE(review): the original body called ``PCADecomposition`` despite the
    ``_svd`` name; this now mirrors the SVD call used by ``task1a_svd``.
    Confirm no caller relied on the PCA output.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    actorIdActorsDf = DataHandler.actor_info_df
    genre_actor_tags_df = DataHandler.load_genre_actor_matrix(genre)
    gmMap = DataHandler.genre_movie_map
    if genre not in gmMap.keys():
        print("genre " + genre + " not present in data\n")
        return

    # Index of the transposed frame == the frame's column labels.
    actorsInDf = list(genre_actor_tags_df.transpose().index)
    _u, _sigma, genre_semantics = decompositions.SVDDecomposition(
        genre_actor_tags_df, 4)

    print("The 4 semantics for genre:" + genre + " are")
    rows = np.matrix(genre_semantics).tolist()
    for number, semantic in enumerate(rows, start=1):
        print("semantic " + str(number) + ": ")
        prettyPrintActorVector(semantic, actorsInDf, actorIdActorsDf)
        print("")
    return
def similarMovieActor_LDA(givenMovie):
    """Print the 10 actors closest to ``givenMovie`` in a 5-topic LDA space.

    An LDA model is fitted on the actor-tag frame. The movie and each actor
    are represented with ``DataHandler.representDocInLDATopics`` and compared
    with ``metrics.simlarity_kullback_leibler``. Actors listed for the movie
    in ``DataHandler.movie_actor_map`` are skipped. Results are sorted
    ascending by that score.

    Args:
        givenMovie: a label of the movie-tag frame's index; a message is
            printed and the function returns if it is absent.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map
    actorIndexList = list(actor_tag_dff.index)
    movieIndexList = list(movie_tag_dff.index)

    if givenMovie not in movieIndexList:
        # str(): .get() yields None for an id missing from the name map, and
        # concatenating None raised TypeError on exactly this path.
        print("Movie " + str(movieid_name_map.get(givenMovie)) +
              " not present in mltags data. Quitting")
        return

    actorsForMovie = DataHandler.movie_actor_map.get(givenMovie)
    if actorsForMovie is None:
        # No cast recorded for the movie: nothing to exclude.
        actorsForMovie = ()

    ldaModel, _doc_term_matrix, _id_Term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie's representation does not depend on the actor being compared,
    # so compute it once instead of once per actor.
    movieTopics = DataHandler.representDocInLDATopics(movie_tag_dff,
                                                      givenMovie, ldaModel)
    actor_scores = {}
    for otherActor in actorIndexList:
        if otherActor in actorsForMovie:
            continue
        actorTopics = DataHandler.representDocInLDATopics(
            actor_tag_dff, otherActor, ldaModel)
        actor_scores[otherActor] = metrics.simlarity_kullback_leibler(
            movieTopics, actorTopics)

    # Exactly ten entries; the original slice [0:11] returned eleven.
    top10 = sorted(actor_scores.items(), key=itemgetter(1))[0:10]
    for actor_id, score in top10:
        print(DataHandler.actor_actorid_map.get(actor_id), score)
    return
def task1_2LDA():
    """Prompt for a user id, print 5 LDA-based recommendations, loop on feedback.

    Scores are the dot product of the transposed frame returned by
    ``rf.runLDADecomposition`` with ``rf.finalWeights`` (read before the
    decomposition runs). The five highest are printed with their movie
    names. Afterwards the user is asked whether to give feedback: ``Y`` calls
    ``LDAFeedback(movies)`` and asks again, ``N`` ends the loop, anything
    else prints an error and asks again.
    """
    typed_id = input("UserID : ")
    names = DataHandler.movieid_name_map
    userid = int(typed_id)
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    rf.loadBase(userid)
    weights = rf.finalWeights
    similarity_subset = rf.runLDADecomposition(userid)  #update

    sim = list(similarity_subset.T.dot(weights).astype(np.float32))
    candidates = list(similarity_subset.columns)
    # Both orderings are descending: values for display, positions for lookup.
    top_scores = list(np.sort(sim)[::-1])[:5]
    descending_positions = list(np.argsort(sim)[::-1])
    movies = [candidates[i] for i in descending_positions][:5]
    named_movies = [names[movie] for movie in movies]
    watched_names = [names[movieid] for movieid in rf.moviesWatched]

    print(watched_names)
    print("---------------------------------------------")
    print('Top 5 movies and their similarity scores: \n' +
          str(list(zip(named_movies, top_scores))) + "\n")

    while True:
        answer = input("Would you like to give feedback 'Y'/'N': ")
        if answer == 'N':
            break
        if answer == 'Y':
            LDAFeedback(movies)
        else:
            print("Invalid Input provided. Please try again.")
def task5_2():
    """Train ``DT.DecisionTree`` on the labelled movies and print predictions.

    Training labels are encoded as integer codes (their position in
    ``uniqueLabels``) and attached to the feature frame as a ``label``
    column. Predictions are decoded back through the same list and printed
    as (movie name, label) pairs.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load feature files produced by this project.
    with open(constants.DIRECTORY + "movie_feature_df2", "rb") as feature_file:
        allMovieData = pickle.load(feature_file)

    (train_movies_Matrix, train_label, train_movieids, test_movies_Matrix,
     test_movieids) = createTrainTestData(allMovieData)

    uniqueLabels = list(set(train_label))
    # Encode through a lookup built up front. The original rewrote
    # train_label in place while still scanning it, so an integer label equal
    # to an already-assigned code could be remapped a second time.
    label_to_code = {label: code for code, label in enumerate(uniqueLabels)}
    encoded_labels = [label_to_code[label] for label in train_label]

    train_movies_Matrix_DF = pd.DataFrame(train_movies_Matrix)
    train_movies_Matrix_DF['label'] = pd.Series(encoded_labels)
    feature_columns = list(range(train_movies_Matrix.shape[1]))

    dtModel = DT.DecisionTree()
    dtModel.fit(train_movies_Matrix_DF[feature_columns],
                train_movies_Matrix_DF['label'])

    # int(): tolerate predictions returned as integral floats / numpy scalars.
    predictions = [
        uniqueLabels[int(code)]
        for code in dtModel.predict(pd.DataFrame(test_movies_Matrix))
    ]
    test_movieids_names = [movieid_name_map[mid] for mid in test_movieids]
    print("Results for Decision Tree classifier as (Movie Name, Label): \n" +
          str(list(zip(test_movieids_names, predictions))) + "\n")
def task5_1():
    """Prompt for ``r`` and print r-nearest-neighbour label predictions.

    The prompt repeats until a non-zero string of digits is entered. Train
    and test matrices come from ``createTrainTestData`` on the movie-tag
    frame. For each test movie the predicted label is the most frequent one
    among the first ``r`` labels returned by ``knn.sortAllNNAndGetLabels``.
    """
    while True:
        r = input("Please enter the number of nearest neighbors 'r': ")
        if not r.isdigit():
            print(
                "A Non Integer was given as input. r should be a non zero positive integer. Please try again\n"
            )
            continue
        r = int(r)
        if r == 0:
            print(
                "0 was given as input. r should be a non zero positive integer. Please try again\n"
            )
            continue
        break

    DataHandler.createDictionaries1()
    names = DataHandler.movieid_name_map
    all_movie_data = DataHandler.load_movie_tag_df()
    (train_matrix, train_label, train_movieids, test_matrix,
     test_movieids) = createTrainTestData(all_movie_data)

    train_sparse = sparse.csr_matrix(train_matrix)
    test_sparse = sparse.csr_matrix(test_matrix)
    neighbours = knn.NN(train_sparse, test_sparse)
    labels_per_test = knn.sortAllNNAndGetLabels(neighbours, r, train_label)

    predictions = []
    for neighbour_labels in labels_per_test:
        nearest = neighbour_labels[0:r]
        # Majority vote over the r nearest labels.
        predictions.append(max(set(nearest), key=nearest.count))

    test_names = [names[mid] for mid in test_movieids]
    print(
        "Results for rNearestNeighbors classifier as (Movie Name, Label): \n"
        + str(list(zip(test_names, predictions))) + "\n")
def task1c(userId):
    """Print the user's watched movies in order and the recommended movies.

    Movies are embedded with the second factor of a rank-5 CP decomposition
    of ``DataHandler.getTensor_ActorMovieGenre()``. The resulting movie-movie
    similarity is handed to ``getWeightedSimilarityOrder`` together with
    ``userId``. Watched movies are ordered by the second element of each
    entry in ``DataHandler.user_rated_or_tagged_date_map``.
    """
    global wt
    DataHandler.createDictionaries1()

    factors = decompositions.CPDecomposition(
        DataHandler.getTensor_ActorMovieGenre(), 5)
    sorted_ids = sorted(list(DataHandler.movie_actor_rank_map.keys()))
    latent_movies = pd.DataFrame(factors[1], index=sorted_ids)
    similarity = DataHandler.movie_movie_Similarity1(latent_movies)

    history = list(DataHandler.user_rated_or_tagged_date_map.get(userId))
    history = sorted(history, key=itemgetter(1))
    # Transpose the entries; the first column holds the movie ids.
    columns = list(zip(*history))
    watched_ids = list(columns[0])

    recommended_ids = getWeightedSimilarityOrder(similarity, userId)
    names = DataHandler.movieid_name_map
    recommended_names = [names[movieid] for movieid in recommended_ids]
    watched_names = [names[movieid] for movieid in watched_ids]

    print('Movies Watched by the user in order: ' + str(watched_names))
    print('Top 5 movies : ' + str(recommended_names))
def task3():
    """Interactive LSH index construction and r-nearest-neighbour movie search.

    Flow, as written below:
      1. Load the movie latent-space CSV and the list of movie ids to index.
      2. Prompt for the number of layers ``L`` and hashes per layer ``k``
         until both are digit strings, then build the index with
         ``lsh.createAndGetLSH_IndexStructure``.
      3. Loop on a menu: ``R`` re-indexes with new L/k, ``S`` runs a search,
         ``X`` exits.
      4. A search prompts for a movie id and ``r``, prints the LSH-based and
         brute-force neighbours with their distances, then offers relevance
         feedback, which is delegated to ``task4``.

    NOTE(review): the source was whitespace-collapsed; the nesting below is a
    reconstruction — confirm loop extents against the original file.
    """
    #3.1
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map
    MoviesinLatentSpace = pd.read_csv(constants.DIRECTORY +
                                      'MoviesinLatentSpace_SVD_MDS.csv',
                                      index_col=0)
    # NOTE(review): reads the same CSV as MoviesinLatentSpace above — confirm
    # whether a different "semantics in tags" file was intended.
    SemanticsInTagsDf = pd.read_csv(constants.DIRECTORY +
                                    'MoviesinLatentSpace_SVD_MDS.csv',
                                    index_col=0)
    moviesList = list(MoviesinLatentSpace.index)
    MoviesinLatentSpace_Matrix = np.matrix(MoviesinLatentSpace,
                                           dtype=np.float32)
    print("Mapped all the movies to 500 dimensional space\n")
    # d: number of columns of the latent-space frame; w: taken from constants.
    d = len(MoviesinLatentSpace.columns)
    w = constants.W
    # Same expression as a few lines above; the matrix is simply rebuilt.
    MoviesinLatentSpace_Matrix = np.matrix(MoviesinLatentSpace,
                                           dtype=np.float32)
    inputFile = pd.read_csv(constants.DIRECTORY + 'Task3_MovieIds.csv',
                            header=None)
    # First column of the header-less CSV holds the movie ids to index.
    movieids_input = list(inputFile[0])
    num_moviesForIndexing = len(movieids_input)
    # Row positions of those ids within the latent-space frame.
    movieidsIndices_input = [moviesList.index(mid) for mid in movieids_input]
    MoviesinLatentSpace_Matrix_Input = MoviesinLatentSpace_Matrix[
        movieidsIndices_input]
    indexing = True
    # Initial indexing: re-prompt until both L and k are digit strings.
    while indexing:
        L = input("Please enter the number of Layers 'L': ")
        if not L.isdigit():
            print(
                "A Non Integer was given as input. L should be an integer.\n")
            indexing = True
            continue
        else:
            L = int(L)
        k = input("Please enter the number of hashes per layer 'k': ")
        if not k.isdigit():
            print(
                "A Non Integer was given as input. k should be an integer.\n")
            indexing = True
            continue
        else:
            k = int(k)
        print("Creating the index structure, considering " +
              str(num_moviesForIndexing) + " movies")
        # layerTables stores L*K random 'd' dimensional vectors and random offset values 'b'
        # LHashTables_result contains hashtables for each layer with keys provided by its K hash functions and values as the movie indices
        layerTables, LHashTables_result = lsh.createAndGetLSH_IndexStructure(
            L, k, d, w, MoviesinLatentSpace_Matrix_Input)
        print("Index Structure Created\n")
        indexing = False
    # Menu state flags.
    reIndex = False
    doSearch = False
    exitVar = False  # never set to True below; the loop ends via `break`
    takeUserInput = True
    while not exitVar:
        wantFeedback = True
        if takeUserInput:
            print("To Re-Index the index structure Press 'R'")
            print("To perform rNearestNeigbhor Search Press 'S'")
            print("To Exit Press 'X'")
            userInput = input("Your Response: ")
            if userInput == 'X':
                print("Exiting..")
                break
            elif userInput == "R":
                print("Re-Indexing..")
                reIndex = True
            elif userInput == "S":
                doSearch = True
            else:
                print("Invalid input. Please choose among the following: \n")
                takeUserInput = True
                continue
        if reIndex:
            reIndex = True
            # Same L/k validation as the initial indexing loop.
            while reIndex:
                L = input("Please enter the number of Layers 'L': ")
                if not L.isdigit():
                    print(
                        "A Non Integer was given as input. L should be an integer. Please try again\n"
                    )
                    reIndex = True
                    continue
                else:
                    L = int(L)
                k = input("Please enter the number of hashes per layer 'k': ")
                if not k.isdigit():
                    print(
                        "A Non Integer was given as input. k should be an integer. Please try again\n"
                    )
                    reIndex = True
                    continue
                else:
                    k = int(k)
                reIndex = False
            print("Creating the index structure, considering " +
                  str(num_moviesForIndexing) + " movies")
            layerTables, LHashTables_result = lsh.createAndGetLSH_IndexStructure(
                L, k, d, w, MoviesinLatentSpace_Matrix_Input)
            print("Index Structure Created Again\n")
            reIndex = False
        if doSearch:
            doSearch = True
            # Re-prompt until a known movie id and a non-zero r are entered.
            while doSearch:
                movieid = input("Please enter a movieID: ")
                if not movieid.isdigit():
                    print(
                        "A Non Integer was given as input. movieid should be an integer. Please try again\n"
                    )
                    doSearch = True
                    # takeUserInput = False
                    # reIndex = False
                    continue
                else:
                    movieid = int(movieid)
                    doSearch = False
                if movieid not in MoviesinLatentSpace.index:
                    print(
                        "The given movieid does not exist. Please try again\n")
                    doSearch = True
                    # takeUserInput = False
                    # reIndex = False
                    continue
                r = input("Please enter the number of nearest neighbors 'r': ")
                if not r.isdigit():
                    print(
                        "A Non Integer was given as input. r should be a non zero positive integer. Please try again\n"
                    )
                    doSearch = True
                    # takeUserInput = False
                    # reIndex = False
                    continue
                else:
                    r = int(r)
                    doSearch = False
                if r == 0:
                    print(
                        "0 was given as input. r should be a non zero positive integer. Please try again\n"
                    )
                    doSearch = True
                    # NOTE(review): takeUserInput stays False until a search
                    # completes; with the nesting shown here, the "no movie
                    # in the same buckets" `continue` below would then skip
                    # the menu with both action flags False — confirm.
                    takeUserInput = False
                    reIndex = False
                    continue
            # Latent-space row of the query movie.
            moviePoint = MoviesinLatentSpace_Matrix[moviesList.index(
                movieid)].astype(np.float32)
            nearestMovies, nearestMoviesBruteForce, nearestMoviesDistance, nearestMoviesDistanceBruteForce = rNearestNeighborSimilarMovies.getRNearestNeighbors(
                movieid, moviePoint, r, MoviesinLatentSpace, layerTables,
                LHashTables_result, movieidsIndices_input, movieids_input)
            # Keep the first row of each distance result, truncated to r.
            nearestMoviesDistance, nearestMoviesDistanceBruteForce = list(
                np.array(nearestMoviesDistance)[0])[:r], list(
                    np.array(nearestMoviesDistanceBruteForce)[0])[:r]
            if len(nearestMovies) == 0:
                print(
                    "The LSH based index structure didn't map any other movie in the same buckets.\n"
                )
                continue
            if len(nearestMovies) != r:
                print(
                    "The LSH based index structure didn't map enough movies in the same buckets.\n"
                )
            nearestMoviesNames = [
                movieid_name_map[mid] for mid in nearestMovies
            ]
            nearestMoviesBruteForceNames = [
                movieid_name_map[mid] for mid in nearestMoviesBruteForce
            ]
            print("Movies Similar to '" + str(movieid_name_map[movieid]) +
                  "'\n")
            print(
                "Results based on the LSH based rNearestNeighbors and their distance scores: \n"
                + str(list(zip(nearestMoviesNames, nearestMoviesDistance))) +
                "\n")
            print(
                "Results based on Brute Force rNearestNeighbors and their distance scores: \n"
                + str(
                    list(
                        zip(nearestMoviesBruteForceNames,
                            nearestMoviesDistanceBruteForce))) + "\n")
            # Feedback loop: 'Y' runs task4 and asks again, 'N' leaves.
            while wantFeedback:
                feedback = input("Would you like to give feedback 'Y'/'N': ")
                if feedback == 'Y':
                    task4(moviePoint, r, movieid, LHashTables_result,
                          MoviesinLatentSpace, layerTables, nearestMovies,
                          movieidsIndices_input, movieids_input,
                          SemanticsInTagsDf)
                    wantFeedback = True
                elif feedback == 'N':
                    wantFeedback = False
                else:
                    print("Invalid Input provided. Please try again.")
                    wantFeedback = True
            takeUserInput = True