def show_gender_genre_comparison():
    """Show a grouped bar chart comparing genre totals of male and female users.

    Loads the MovieLens-1M user-genre matrix and the user-gender dictionary
    through ``MD``, sums the matrix rows separately for users labelled
    ``"M"`` and for all other users, prints both sums, and draws them as
    side-by-side bars with one pair per genre.

    Side effects: sets the global matplotlib font size to 28, prints the two
    accumulated vectors, and calls ``plt.show()``.
    """
    plt.rcParams.update({'font.size': 28})

    genres = [
        "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ]

    # NOTE(review): hard-coded divisors; presumably the approximate number of
    # ratings given by male and female users in ML-1M -- confirm.
    male_normalizer = 750000
    female_normalizer = 250000

    male_genre = np.zeros(len(genres))
    female_genre = np.zeros(len(genres))
    user_gender_dict = MD.gender_user_dictionary_1m()
    user_genre = MD.load_user_genre_matrix_1m()

    # Every user not labelled "M" is counted in the female totals.
    for user_index, user in enumerate(user_genre):
        if user_gender_dict[user_index] == "M":
            male_genre += user
        else:
            female_genre += user
    print(male_genre)
    print(female_genre)

    x = np.arange(len(genres))
    ax = plt.subplot(111)
    # Shift the two series by half a bar width so they sit next to each other.
    ax.bar(x - 0.2, male_genre / male_normalizer, width=0.4, label='male')
    ax.bar(x + 0.2, female_genre / female_normalizer, width=0.4, label='female')
    plt.xticks(x, genres)
    plt.setp(ax.get_xticklabels(), rotation=40)
    plt.ylabel("Normalized rating count")
    plt.xlabel("Genres")
    plt.legend()
    # Run the layout last so it accounts for the rotated tick labels and the
    # axis labels; calling it earlier can leave them clipped.
    plt.tight_layout()
    plt.show()
def show_avg_rating_gender_per_movie(movie_id=1):
    """Show a two-bar chart of the average rating per gender for one movie.

    Only strictly positive entries of the selected user-item column are
    treated as ratings. Users labelled ``'M'`` go into the "male" bar; every
    other user goes into the "female" bar.

    Args:
        movie_id: Column of the user-item matrix to inspect.
            NOTE(review): the value is used directly as a column index, not
            converted from a 1-based MovieLens ID -- confirm what callers
            pass.
    """
    user_gender = MD.gender_user_dictionary_1m()
    rating_matrix = MD.load_user_item_matrix_1m()

    collected = {"male": [], "female": []}
    for row, score in enumerate(rating_matrix[:, movie_id]):
        # Skip entries that are not a positive rating.
        if not score > 0:
            continue
        group = "male" if user_gender[row] == 'M' else "female"
        collected[group].append(score)

    bar_labels = ["male", "female"]
    bar_heights = [np.average(collected[group]) for group in bar_labels]
    plt.bar(bar_labels, bar_heights)
    plt.show()
def loyal_vs_diverse():
    """Report how strongly each user's ratings concentrate on a single genre.

    Builds a (6040, n_genres) matrix by adding, for every line of
    ``ml-1m/ratings.dat``, the rated movie's genre row to the rating user's
    row. A user's "loyalty" is the largest entry of their row divided by the
    row sum. For each threshold in 0.5, 0.6 and 0.7 the function prints the
    gender and the five largest genre entries of every user whose loyalty
    exceeds the threshold, followed by the number of such users. It then
    scatter-plots loyalty against user index (blue for users labelled
    ``'M'``, red for everyone else) and prints the average loyalty of both
    groups.

    The scatter plot is drawn on the current matplotlib axes but is not shown
    here. NOTE(review): presumably the caller invokes ``plt.show()`` --
    confirm.
    """
    genres = [
        "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ]
    movie_genre = MD.load_movie_genre_matrix_1m(combine=True)
    user_genre_distr = np.zeros(shape=(6040, movie_genre.shape[1]))
    user_gender_dict = MD.gender_user_dictionary_1m()
    print(user_genre_distr.shape)

    with open("ml-1m/ratings.dat", 'r') as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            # Each line has four "::"-separated fields; only the first two
            # (user and movie, both 1-based) are needed here.
            user_id, movie_id, _, _ = line.split("::")
            user_genre_distr[int(user_id) - 1, :] += movie_genre[int(movie_id) - 1, :]

    # Loyalty of every user, computed once. A row that sums to zero yields
    # nan (with a numpy warning), which never exceeds a threshold.
    loyalty = user_genre_distr.max(axis=1) / user_genre_distr.sum(axis=1)

    for loyal_percent in (0.5, 0.6, 0.7):
        loyal_count = 0
        for user_index, user in enumerate(user_genre_distr):
            if loyalty[user_index] > loyal_percent:
                print(user_gender_dict[user_index])
                # Indices of the five largest entries, largest first.
                # NOTE(review): assumes the columns of movie_genre line up
                # with `genres` when combine=True -- confirm.
                for index in user.argsort()[-5:][::-1]:
                    print(genres[index], user[index])
                loyal_count += 1
        print("For threshold", loyal_percent, ",", loyal_count, "users are considered loyal")

    user_indices = np.arange(len(user_genre_distr))
    is_male = np.array(
        [user_gender_dict[user_index] == 'M' for user_index in range(len(user_genre_distr))],
        dtype=bool,
    )
    # Two batched scatter calls instead of one call per user.
    plt.scatter(user_indices[is_male], loyalty[is_male], c='b')
    plt.scatter(user_indices[~is_male], loyalty[~is_male], c='r')
    print(np.average(loyalty[is_male]))
    print(np.average(loyalty[~is_male]))
def test_avg_rating_gender_per_movie_1m():
    """Print the movies whose ratings differ significantly between genders.

    For every column of the MovieLens-1M user-item matrix, the strictly
    positive ratings are split into those from users labelled ``'M'`` and
    those from all other users, and the two samples are compared with a
    Mann-Whitney U test. A movie counts as significant when the p-value is
    below 0.05 divided by the number of columns (a Bonferroni-style
    correction). Each significant movie is printed as ``id::title::genres::M``
    or ``...::F`` depending on which group has the higher average rating;
    nothing is printed for it when the averages are equal. Column ``i`` is
    reported as movie ID ``i + 1``.

    Also prints the number of columns up front, a "Testing failed for"
    message for every column whose test raised, and finally the number of
    significant movies.
    """
    import MovieLensData as MD
    from scipy.stats import mannwhitneyu

    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()

    movies = {}
    # NOTE(review): the stock ML-1M movies.dat is Latin-1 encoded and fails to
    # decode under a UTF-8 default; confirm the local copy was not re-encoded.
    with open("ml-1m/movies.dat", 'r', encoding="ISO-8859-1") as f:
        for line in f:
            movie_key, name, genre = line.replace("\n", "").split("::")
            movies[int(movie_key)] = name + "::" + genre

    counter = 0
    movie_count = len(user_item[0])
    print(movie_count)
    # Significance level corrected for the number of tests performed.
    corrected_alpha = 0.05 / movie_count

    for movie_id in range(movie_count):
        ratings = user_item[:, movie_id]
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(ratings):
            if rating > 0:
                if gender_dict[user_id] == 'M':
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)

        try:
            _, p_value = mannwhitneyu(male_ratings, female_ratings)
            if p_value < corrected_alpha:
                counter += 1
                male_average = np.average(male_ratings)
                female_average = np.average(female_ratings)
                if male_average > female_average:
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::M")
                if male_average < female_average:
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::F")
        except Exception:
            # Best effort: report the column and move on. Catching Exception
            # rather than everything lets Ctrl-C and SystemExit stop the loop.
            print("Testing failed for", movie_id)

    # NOTE(review): prints the literal ID "2" next to the entry for movie 1;
    # looks like leftover debug output -- confirm before relying on it.
    print(str(1 + 1) + "::" + movies[1])
    print(counter)