Beispiel #1
0
def show_gender_genre_comparison():
    """Show a grouped bar chart of per-genre totals for male vs. other users.

    The rows of the matrix returned by ``MD.load_user_genre_matrix_1m()``
    are summed separately for users whose entry in
    ``MD.gender_user_dictionary_1m()`` is ``"M"`` and for every other user.
    Both raw totals are printed, then scaled by fixed per-gender divisors
    and drawn side by side with ``plt.show()``.

    Side effects: updates the global matplotlib font size to 28.
    """
    plt.rcParams.update({'font.size': 28})
    genres = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama",
              "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War",
              "Western"]
    # Fixed divisors applied to the raw totals before plotting.
    # NOTE(review): presumably the approximate number of ratings given by
    # each gender in the 1M data set -- confirm against the data.
    male_divisor = 750000
    female_divisor = 250000

    male_genre = np.zeros(shape=(len(genres),))
    female_genre = np.zeros(shape=(len(genres),))
    user_gender_dict = MD.gender_user_dictionary_1m()
    user_genre = MD.load_user_genre_matrix_1m()
    # Row position in the user-genre matrix doubles as the key into the
    # gender dictionary.
    for user_index, user in enumerate(user_genre):
        if user_gender_dict[user_index] == "M":
            male_genre += user
        else:
            female_genre += user
    print(male_genre)
    print(female_genre)

    x = np.arange(len(genres))
    ax = plt.subplot(111)
    # Two bars of width 0.4 per genre, centred on the tick position.
    ax.bar(x - 0.2, male_genre / male_divisor, width=0.4, label='male')
    ax.bar(x + 0.2, female_genre / female_divisor, width=0.4, label='female')
    plt.xticks(x, genres)
    plt.legend()
    plt.setp(ax.get_xticklabels(), rotation=40)
    plt.ylabel("Normalized rating count")
    plt.xlabel("Genres")
    # Run the layout pass last so it accounts for the rotated tick labels
    # and the axis labels set above.
    plt.tight_layout()
    plt.show()
Beispiel #2
0
def show_avg_rating_gender_per_movie(movie_id=1):
    """Show a two-bar chart of the average rating of one movie per gender.

    Only entries greater than zero in the selected column of the user-item
    matrix are treated as ratings. Users whose gender entry is ``'M'`` go
    into the "male" group, all others into the "female" group.

    Args:
        movie_id: Column index into the matrix returned by
            ``MD.load_user_item_matrix_1m()``.
            NOTE(review): it is used as-is as a column index; whether that
            equals the MovieLens movie id or ``id - 1`` depends on the
            loader -- confirm.
    """
    gender_by_user = MD.gender_user_dictionary_1m()
    rating_matrix = MD.load_user_item_matrix_1m()
    movie_column = rating_matrix[:, movie_id]

    grouped = {"male": [], "female": []}
    for row, score in enumerate(movie_column):
        if not score > 0:
            # Not a rating; the gender of this user is never looked up.
            continue
        group = "male" if gender_by_user[row] == 'M' else "female"
        grouped[group].append(score)

    averages = [np.average(grouped["male"]), np.average(grouped["female"])]
    plt.bar(["male", "female"], averages)
    plt.show()
Beispiel #3
0
def loyal_vs_diverse():
    """Report how strongly each user's ratings concentrate on a single genre.

    Builds a per-user genre count matrix by adding, for every line of
    ``ml-1m/ratings.dat``, the rated movie's row of
    ``MD.load_movie_genre_matrix_1m(combine=True)`` to the rating user's
    row. A user's "loyalty" is the largest entry of their row divided by
    the row sum.

    For each threshold in 0.5, 0.6 and 0.7 the function prints, for every
    user whose loyalty exceeds it, the gender and the five largest genre
    entries, followed by the number of such users. It then adds one scatter
    point per user (blue for gender ``'M'``, red otherwise) to the current
    matplotlib figure and prints the mean loyalty of each group.

    The function does not call ``plt.show()``; displaying the figure is
    left to the caller.

    NOTE(review): a user without any rating has a zero row sum, which makes
    the loyalty ``nan`` (numpy division); such a user never counts as loyal
    and turns the printed group mean into ``nan``.
    NOTE(review): ``genres`` has 18 entries while the number of matrix
    columns comes from the loader with ``combine=True`` -- confirm that the
    column order and count match these labels.
    """
    genres = [
        "Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    movie_genre = MD.load_movie_genre_matrix_1m(combine=True)
    user_genre_distr = np.zeros(shape=(6040, movie_genre.shape[1]))
    user_gender_dict = MD.gender_user_dictionary_1m()
    print(user_genre_distr.shape)
    with open("ml-1m/ratings.dat", 'r') as f:
        # Stream the file line by line instead of loading it whole.
        for line in f:
            user_id, movie_id, _rating, _timestamp = line.split("::")
            # Ids in the file are 1-based; matrix rows are 0-based.
            movie_row = int(movie_id) - 1
            user_row = int(user_id) - 1
            user_genre_distr[user_row, :] += movie_genre[movie_row, :]

    # The ratio does not depend on the threshold, so compute it once and
    # reuse it for every threshold and for the plot.
    loyalties = [max(user) / sum(user) for user in user_genre_distr]

    for loyal_percent in (0.5, 0.6, 0.7):
        loyal_count = 0
        for user_index, user in enumerate(user_genre_distr):
            if loyalties[user_index] > loyal_percent:
                print(user_gender_dict[user_index])
                # Indices of the five largest entries, largest first.
                top_5_index = user.argsort()[-5:][::-1]
                for index in top_5_index:
                    print(genres[index], user[index])
                loyal_count += 1
        print("For threshold", loyal_percent, ",", loyal_count,
              "users are considered loyal")

    user_loyalty_male = []
    user_loyalty_female = []
    point_colors = []
    for user_index, loyalty in enumerate(loyalties):
        if user_gender_dict[user_index] == 'M':
            user_loyalty_male.append(loyalty)
            point_colors.append('b')
        else:
            user_loyalty_female.append(loyalty)
            point_colors.append('r')
    # One scatter call with per-point colours keeps the points in user
    # order while avoiding a separate artist for every user.
    plt.scatter(np.arange(len(loyalties)), loyalties, c=point_colors)
    print(np.average(user_loyalty_male))
    print(np.average(user_loyalty_female))
Beispiel #4
0
def test_avg_rating_gender_per_movie_1m():
    """Print the movies whose ratings differ significantly between genders.

    For every column of the user-item matrix the entries greater than zero
    are split into ratings by users whose gender entry is ``'M'`` and
    ratings by all other users, and the two samples are compared with a
    Mann-Whitney U test. A movie counts as significant when the p-value is
    below ``0.05`` divided by the number of columns (a Bonferroni-style
    correction for testing every movie).

    For each significant movie a line ``<id>::<title>::<genres>::<M|F>`` is
    printed, where the last field names the group with the higher average
    rating; nothing is printed when both averages are equal. Movies for
    which the test cannot be computed are reported with
    ``Testing failed for <column index>``. Finally one extra line (see the
    note at the end of the body) and the number of significant movies are
    printed.
    """
    import MovieLensData as MD
    from scipy.stats import mannwhitneyu
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()

    # Map movie id -> "<title>::<genres>" as stored in movies.dat.
    movies = {}
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f:
            movie_key, name, genre = line.replace("\n", "").split("::")
            movies[int(movie_key)] = name + "::" + genre

    num_movies = len(user_item[0])
    print(num_movies)
    # Per-test significance level after correcting for num_movies tests.
    significance = 0.05 / num_movies
    counter = 0
    for movie_id in range(num_movies):
        ratings = user_item[:, movie_id]
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(ratings):
            if rating > 0:
                if gender_dict[user_id] == 'M':
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)

        # Only the statistical test is guarded, so unrelated errors (and
        # Ctrl-C) are no longer swallowed.
        try:
            _, p_value = mannwhitneyu(male_ratings, female_ratings)
        except ValueError:
            print("Testing failed for", movie_id)
            continue

        if not p_value < significance:
            continue
        counter += 1

        male_avg = np.average(male_ratings)
        female_avg = np.average(female_ratings)
        if male_avg > female_avg:
            higher = "M"
        elif male_avg < female_avg:
            higher = "F"
        else:
            continue

        # Column index is 0-based, ids in movies.dat are 1-based.
        title = movies.get(movie_id + 1)
        if title is None:
            # Keep the best-effort behaviour for ids missing in movies.dat.
            print("Testing failed for", movie_id)
            continue
        print(str(movie_id + 1) + "::" + title + "::" + higher)

    # NOTE(review): this prints the id 2 followed by the entry stored under
    # key 1; it looks like a leftover debug line -- confirm before relying
    # on it.
    print(str(1 + 1) + "::" + movies[1])
    print(counter)