コード例 #1
0
def show_gender_genre_comparison():
    """Draw grouped bars comparing per-genre totals of male and female users.

    Rows of the matrix returned by ``MD.load_user_genre_matrix_1m`` are summed
    separately for users whose gender entry is ``"M"`` and for everyone else.
    Both sums are printed and then plotted, scaled by fixed divisors
    (750000 / 250000 -- presumably rough rating counts per gender; confirm).
    """
    plt.rcParams.update({'font.size': 28})
    genre_names = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama",
                   "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War",
                   "Western"]
    nr_genres = len(genre_names)

    # Loaded exactly as before; the result is not used further down.
    movie_genre = MD.load_movie_genre_matrix_1m()
    male_total = np.zeros(shape=(nr_genres,))
    female_total = np.zeros(shape=(nr_genres,))
    gender_of = MD.gender_user_dictionary_1m()
    for row_number, genre_row in enumerate(MD.load_user_genre_matrix_1m()):
        # Every user that is not marked "M" is counted as female.
        accumulator = male_total if gender_of[row_number] == "M" else female_total
        accumulator += genre_row
    print(male_total)
    print(female_total)

    positions = np.arange(nr_genres)
    axis = plt.subplot(111)
    axis.bar(positions - 0.2, male_total / 750000, width=0.4, label='male')
    axis.bar(positions + 0.2, female_total / 250000, width=0.4, label='female')
    plt.xticks(positions, tuple(genre_names))
    plt.legend()
    plt.tight_layout()
    plt.setp(axis.get_xticklabels(), rotation=40)
    plt.ylabel("Normalized rating count")
    plt.xlabel("Genres")
    plt.show()
コード例 #2
0
ファイル: Obfuscation.py プロジェクト: STrucks/BlurMore
def rating_add_1m():
    """Obfuscate the user-item matrix by adding random ratings per user.

    For every user, random ratings in 1..5 are written to items the user has
    not rated yet until 5% of the user's original rating count has been added
    (or 100 draws hit an already rated item). The result is written to
    ``ml-1m/random_added_obfuscated_<percentage>.dat`` in
    ``user::movie::rating::000000000`` format with 1-based ids.

    Returns:
        The obfuscated user-item matrix.
    """
    X = MD.load_user_item_matrix_1m()
    X_obf = MD.load_user_item_matrix_1m()
    percentage = 0.05
    for user_index, user in enumerate(X):
        nr_ratings = int(np.count_nonzero(user > 0))
        budget = nr_ratings * percentage

        added = 0
        safety_counter = 0
        while added < budget and safety_counter < 100:
            index = np.random.randint(0, len(user))
            if X_obf[user_index, index] > 0:
                # Slot already taken; bound the number of retries.
                safety_counter += 1
                continue
            X_obf[user_index, index] = np.random.randint(1, 6)
            # Count the new rating so the loop stops once the budget is met.
            added += 1

    # Output the data in a file, one line per non-zero rating (row-major).
    with open("ml-1m/random_added_obfuscated_" + str(percentage) + ".dat", 'w') as f:
        for index_user, index_movie in np.argwhere(X_obf > 0):
            rating = int(X_obf[index_user, index_movie])
            f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::" + str(rating) + "::000000000\n")
    return X_obf
コード例 #3
0
def avg_rating_diff():
    """Plot the movies with the largest male/female average-rating gap.

    For each movie column the mean of the positive ratings of users with
    gender label 0 ("males" in this code) and label 1 ("females") is computed
    over *all* users of that gender. Movies lacking ratings from either group
    keep a difference of 0. The 20 largest and 20 smallest differences are
    printed by name and shown as a horizontal bar chart.
    """
    X = MD.load_user_item_matrix_100k()
    T = MD.load_gender_vector_100k()
    name_dict = MD.load_movie_id_dictionary_100k()
    males_indecies = np.argwhere(T == 0)[:, 0]
    females_indecies = np.argwhere(T == 1)[:, 0]
    differences = np.zeros(shape=X.shape[1],)
    for movie_index, movie in enumerate(np.transpose(X)):
        male_ratings = movie[males_indecies]
        female_ratings = movie[females_indecies]
        # Filter each group on its own; zipping the two groups would cut the
        # larger one down to the size of the smaller one.
        male_ratings = male_ratings[male_ratings > 0]
        female_ratings = female_ratings[female_ratings > 0]
        if male_ratings.size > 0 and female_ratings.size > 0:
            differences[movie_index] = np.average(male_ratings) - np.average(female_ratings)

    # Rows of [difference, column index], largest difference first.
    ranked = sorted(([differences[index], index] for index in range(differences.shape[0])), reverse=True)
    differences = np.asarray(ranked)
    print(differences[0:20, 1])
    extreme_rows = np.concatenate((differences[0:20], differences[-20:]), axis=0)
    # The index column is float after np.asarray; the name lookup is 1-based.
    names = [name_dict[int(index) + 1] for index in extreme_rows[:, 1]]
    print(names)
    fig, ax = plt.subplots()
    ax.barh(range(40), extreme_rows[:, 0], align='center')
    ax.set_yticks(range(40))
    ax.set_yticklabels(names)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Difference')
    ax.set_title('Rating difference between males and females')

    plt.show()
コード例 #4
0
def one_million_obfuscated(classifier):
    """Cross-validate a logistic regression on original vs. masked data.

    Loads the gender vector, the original user-item matrix and the masked
    matrix with ``file_index=55``, prints a few diagnostics and hands both
    matrices to ``Utils.ROC_cv_obf``. ``classifier`` is not used here.
    """
    T = MD.load_gender_vector_1m()
    X1 = MD.load_user_item_matrix_1m()
    X2 = MD.load_user_item_matrix_1m_masked(file_index=55)
    print(X1.shape, X2.shape)

    # 80/20 split: train on the original data, test on the masked data.
    train_end = int(0.8 * len(X1))
    test_start = int(0.8 * len(X2))
    X_train, T_train = X1[0:train_end], T[0:train_end]
    X_test, T_test = X2[test_start:], T[test_start:]
    print(list(X1[0, :]))
    print(list(X2[0, :]))
    print("before", X_train.shape)
    print(X_train.shape)

    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)

    Utils.ROC_cv_obf(X1, X2, T, model)

    # A fresh, unfitted model is created again, as in the original code.
    model = LogisticRegression(penalty='l2', random_state=random_state)
コード例 #5
0
def comp_BM_and_BMpp():
    """Compare per-movie rating counts of original and two masked data sets.

    Three bar charts sharing a y axis show the number of positive ratings for
    the first 50 movie columns of the original matrix and of the masked
    matrices with ``file_index`` 63 and 75. Totals and count differences are
    printed.
    """
    plt.rcParams.update({'font.size': 28})
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    interval_start, interval_end = 0, 50
    movie_positions = range(interval_start, interval_end)

    def count_ratings(matrix):
        # Number of positive entries for each movie column in the interval.
        columns = np.transpose(matrix)[interval_start:interval_end]
        return [sum(1 for x in movie if x > 0) for movie in columns]

    def draw_panel(axis, counts, title):
        axis.bar(movie_positions, counts)
        axis.set_title(title)
        axis.set_xlabel("movie ID")
        axis.set_ylabel("#ratings")

    original_counts = count_ratings(MD.load_user_item_matrix_1m())
    draw_panel(ax1, original_counts, "(A)\nOriginal data")
    print("Original Data:", sum(original_counts))

    blurme_counts = count_ratings(MD.load_user_item_matrix_1m_masked(file_index=63))
    draw_panel(ax2, blurme_counts, "(B)\nBlurMe data")
    print("BlurMe Data:", sum(blurme_counts))

    blurmore_counts = count_ratings(MD.load_user_item_matrix_1m_masked(file_index=75))
    draw_panel(ax3, blurmore_counts, "(C)\nBlurM(or)e data")
    print("BlurMe++ Data:", sum(blurmore_counts))

    movie_count_o = np.asarray(original_counts)
    masked_count = np.asarray(blurme_counts)
    masked_count2 = np.asarray(blurmore_counts)
    print(movie_count_o - masked_count)
    print(masked_count - masked_count2)
    plt.show()
コード例 #6
0
def one_hundert_k(classifier):
    """Call ``classifier`` with the 100k user-item matrix and the age vector."""
    ratings = MD.load_user_item_matrix_100k()
    ages = MD.load_age_vector_100k()

    classifier(ratings, ages)
コード例 #7
0
def one_hundert_k(classifier):
    """Call ``classifier`` in multiclass mode on 100k ratings vs. occupation."""
    ratings = MD.load_user_item_matrix_100k()
    occupations = MD.load_occupation_vector_100k()

    classifier(ratings, occupations, multiclass=True)
コード例 #8
0
def one_hundert_k_obfuscated(classifier):
    """Call ``classifier`` with the masked 100k matrix and the gender vector."""
    masked_ratings = MD.load_user_item_matrix_100k_masked()
    genders = MD.load_gender_vector_100k()

    classifier(masked_ratings, genders)
コード例 #9
0
def one_million(classifier):
    """Classify each user's genre label from masked ratings.

    The masked user-item matrix (``file_index=71``) is normalized with
    ``Utils.normalize`` and passed to ``classifier`` together with the column
    index of the 1 in each row of the one-hot user-genre matrix. The label
    range and the label frequencies are printed first.
    """
    X = MD.load_user_item_matrix_1m_masked(file_index=71)
    one_hot_genres = MD.load_user_genre_matrix_1m(one_hot=True, top=5)
    # Column index of every entry equal to 1 becomes the class label.
    T = np.argwhere(one_hot_genres == 1)[:, 1]
    print(min(T), max(T))
    # Original author's note: class 13 (Romance) is lost -- it seems nobody
    # has Romance as favourite genre, which is plausible because it
    # correlates so strongly with Drama and Comedy.
    import collections
    import matplotlib.pyplot as plt
    label_counts = collections.Counter(T)
    print(label_counts)

    normalized = Utils.normalize(X)
    classifier(normalized, T, multiclass=True, nr_classes=17)
コード例 #10
0
def one_million(classifier):
    """Run gender classification on feature-selected 1m ratings.

    Features are selected with ``Utils.select_male_female_different``, the
    data is split 80/20 in row order, ``classifier`` is called on the training
    part, and a logistic regression fitted on the training part is evaluated
    with ``Utils.ROC_plot`` on the remaining rows.
    """
    ratings = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()

    X = feature_selection(ratings, T, Utils.select_male_female_different)
    boundary = int(0.8 * len(X))
    X_train, T_train = X[:boundary], T[:boundary]
    X_test, T_test = X[boundary:], T[boundary:]

    # Both prints show the same shape; the selection step that used to sit
    # between them is disabled.
    print("before", X_train.shape)
    print(X_train.shape)

    classifier(X_train, T_train)
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2', random_state=np.random.RandomState(0))
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
コード例 #11
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def test_avg_rating_gender_per_movie_100k():
    """Print movies whose male and female ratings differ significantly.

    For every movie column a two-sample t-test (``scipy.stats.ttest_ind``)
    compares the positive ratings of users with gender label 0 ("male" in
    this code) against all other users. The significance level 0.05 is
    divided by the number of movies. Significant movies are printed as
    ``<id>::<name>::M`` or ``::F`` depending on which group rates higher,
    followed by the number of significant movies.
    """
    import MovieLensData as MD
    from scipy.stats import ttest_ind
    gender_vec = MD.load_gender_vector_100k()
    user_item = MD.load_user_item_matrix_100k()

    # Map movie id -> title from the first two '|'-separated fields.
    # NOTE(review): the public ml-100k u.item file is commonly Latin-1
    # encoded; the platform default encoding is used here -- confirm.
    movies = {}
    with open("ml-100k/u.item", 'r') as f:
        for line in f:
            i1 = line.find("|")
            i2 = line.find("|", i1 + 1)
            movies[int(line[:i1])] = line[i1 + 1:i2]

    counter = 0
    nr_movies = len(user_item[0])
    print(nr_movies)
    significance_level = 0.05 / nr_movies
    for movie_id in range(nr_movies):
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(user_item[:, movie_id]):
            if rating > 0:
                if gender_vec[user_id] == 0:
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)
        try:
            # A group without ratings is represented by a single 0.
            if len(male_ratings) == 0:
                male_ratings = np.array([0])
            if len(female_ratings) == 0:
                female_ratings = np.array([0])
            male_avg = np.average(male_ratings)
            female_avg = np.average(female_ratings)
            if male_avg == female_avg:
                continue

            _, p_value = ttest_ind(male_ratings, female_ratings)
            if p_value < significance_level:
                counter += 1
                label = str(movie_id + 1) + "::" + movies[movie_id + 1]
                if male_avg > female_avg:
                    print(label + "::M")
                elif male_avg < female_avg:
                    print(label + "::F")
        except Exception as error:
            # Keep going on a per-movie failure, but report what went wrong;
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(male_ratings, female_ratings)
            print("Testing failed for", movie_id, "-", repr(error))
            continue

    print("counter", counter)
コード例 #12
0
def feature_importance_1m():
    """Compare movie importances from logistic regression and random forest.

    Both models are fitted ten times on the 1m user-item matrix against the
    gender vector; coefficients and forest importances are averaged. A
    histogram of the forest importances is shown, then the names of the top
    100 movies by largest coefficient, smallest coefficient and forest
    importance are printed, followed by the forest names that also appear in
    either coefficient list.
    """
    plt.rcParams.update({'font.size': 18})

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    import seaborn as sns
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    nr_movies = X.shape[1]
    importance = np.zeros(shape=(nr_movies,))
    importance2 = np.zeros(shape=(nr_movies,))

    for _ in range(10):
        model = LogisticRegression()
        model2 = RandomForestClassifier()
        model.fit(X, T)
        model2.fit(X, T)
        importance += model.coef_[0]
        importance2 += model2.feature_importances_
    importance /= 10
    importance2 /= 10

    plt.hist(importance2, bins=np.linspace(0, 0.001, 50))
    plt.xlabel("importance")
    plt.ylabel("frequency")
    plt.title("Importance of movies distribution")
    plt.show()

    # (score, 1-based movie id) pairs in three orderings.
    coef_pairs = list(zip(importance, range(1, len(importance) + 1)))
    forest_pairs = list(zip(importance2, range(1, len(importance2) + 1)))
    largest_coef = sorted(coef_pairs, reverse=True)
    smallest_coef = sorted(coef_pairs)
    largest_forest = sorted(forest_pairs, reverse=True)

    set1 = set()
    set2 = set()
    set3 = set()

    names = MD.load_movie_id_dictionary_1m()
    top = 100
    top_rows = zip(largest_coef[0:top], smallest_coef[0:top], largest_forest[0:top])
    for (_, high_id), (_, low_id), (_, forest_id) in top_rows:
        print(names[high_id], "|", names[low_id], "|", names[forest_id])
        set1.add(names[high_id])
        set2.add(names[low_id])
        set3.add(names[forest_id])

    print(set3.intersection(set2.union(set1)))

    """
コード例 #13
0
def _print_rating_summary(X):
    """Print rating statistics of ``X`` and return its value histogram.

    Entries are truncated to integers and counted per value; the code
    assumes they lie in 0..5, where 0 stands for "no rating".

    Returns:
        Float array whose entry ``i`` is the number of cells with value ``i``.
    """
    X = np.asarray(X)
    frequencies = np.bincount(X.astype(int).ravel(), minlength=6).astype(float)
    ratings = X[X > 0]
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies), "without 0:",
          np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    return frequencies


def rating_distr():
    """Plot the rating-value distribution of original and masked 1m data.

    Three bar charts sharing a y axis show how often each rating 1..5 occurs
    in the original user-item matrix and in the masked matrices with
    ``file_index`` 71 and 55. Summary statistics are printed for each matrix;
    for the original one the number of movies with at least one rating is
    printed as well.
    """
    rating_values = range(1, 6)  # x positions now match the rating they show
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)

    X = MD.load_user_item_matrix_1m()
    frequencies = _print_rating_summary(X)
    # Number of distinct movie columns with at least one positive rating.
    print(np.count_nonzero((np.asarray(X) > 0).any(axis=0)))
    ax1.bar(rating_values, frequencies[1:])
    ax1.set_xlabel("Original")

    X = MD.load_user_item_matrix_1m_masked(file_index=71)  # greedy 10%
    frequencies = _print_rating_summary(X)
    ax2.bar(rating_values, frequencies[1:])
    ax2.set_xlabel("BlurMe")

    X = MD.load_user_item_matrix_1m_masked(file_index=55)  # BlurMe++ 10%, fac=2
    frequencies = _print_rating_summary(X)
    ax3.bar(rating_values, frequencies[1:])
    ax3.set_xlabel("BlurMe++")
    plt.show()
コード例 #14
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def feature_importance_1m():
    """Relate random-forest movie importances to the number of ratings.

    A random forest is fitted ten times on the 1m user-item matrix against
    the gender vector and the importances are averaged. The first 30 are
    shown as bars, the ids of movies with importance >= 0.002 are printed
    with their count, and importance is scattered against the number of
    ratings (all movies, and movies with fewer than 100 ratings).
    """
    from sklearn.ensemble import RandomForestClassifier
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    importance = np.zeros(shape=(X.shape[1], ))
    for _ in range(10):
        forest = RandomForestClassifier()
        forest.fit(X, T)
        importance += forest.feature_importances_
    importance /= 10

    leading = importance[0:30]
    plt.bar(range(1, len(leading) + 1), leading)
    plt.xlabel("movie index")
    plt.ylabel("importance")
    plt.show()

    # Print the 1-based ids of the important movies, then how many there are.
    nr_important = 0
    for movie, score in enumerate(importance):
        if score >= 0.002:
            print(movie + 1, end=",")
            nr_important += 1
    print()
    print(nr_important)

    # Number of positive ratings per movie column.
    nr_ratings = np.zeros(shape=(X.shape[1], ))
    for index, movie in enumerate(np.transpose(X)):
        nr_ratings[index] = sum(1 for rating in movie if rating > 0)

    avg_nr_per_importance = {}
    nr_ratings_importance = []
    for nr, imp in zip(nr_ratings, importance):
        avg_nr_per_importance.setdefault(imp, []).append(nr)
        nr_ratings_importance.append([nr, imp])

    # One scatter call per point, as in the original implementation.
    plt.subplot(1, 2, 1)
    for nr, imp in nr_ratings_importance:
        plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")

    plt.subplot(1, 2, 2)
    for nr, imp in nr_ratings_importance:
        if nr < 100:
            plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")

    plt.show()
コード例 #15
0
def one_million(classifier):
    """Call ``classifier`` in multiclass mode on normalized 1m ratings vs. occupation."""
    max_user = 6040
    max_item = 3952
    ratings = MD.load_user_item_matrix_1m()
    occupations = MD.load_occupation_vector_1m()
    normalized = Utils.normalize(ratings)
    classifier(normalized, occupations, multiclass=True)
コード例 #16
0
def one_million(classifier):
    """Call ``classifier`` with 1m ratings and the age vector (``border=30``).

    The minimum and mean of the first matrix column are printed beforehand.
    """
    max_user = 6040
    max_item = 3952
    ratings = MD.load_user_item_matrix_1m()
    age_labels = MD.load_age_vector_1m(border=30)
    first_column = ratings[:, 0]
    print(min(first_column), np.mean(ratings[:, 0]))
    classifier(ratings, age_labels)
コード例 #17
0
ファイル: FailureAnalysis.py プロジェクト: STrucks/BlurMore
def lot_ratings():
    """Evaluate the gender classifier on test users grouped by rating count.

    A logistic regression is trained on the first 80% of the normalized 1m
    user-item matrix. The remaining users are split into four buckets by
    their number of positive entries (162-210, 211-281, 282-389, 390-1000).
    For each bucket a ROC curve is drawn in a 2x2 grid and the performance
    measures from ``Utils.performance_measures`` are printed.
    """
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    split = int(0.8 * len(X))
    X_train, T_train = X[0:split], T[0:split]
    X_test, T_test = X[split:], T[split:]
    test_data = list(zip(X_test, T_test))

    from sklearn.linear_model import LogisticRegression

    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    roc = True
    min_rating = [162, 211, 282, 390]

    for index, max_rating in enumerate([210, 281, 389, 1000]):
        selected_X = []
        selected_T = []
        for user, label in test_data:
            counter = int(np.count_nonzero(user > 0))
            if min_rating[index] <= counter <= max_rating:
                selected_X.append(user)
                selected_T.append(label)
        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)

        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            plt.title(
                'Receiver Operating Characteristic with users having rated between ' + str(max_rating) + " and " + str(
                    min_rating[index]) + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For max rating =", max_rating, ":")
        Y = model.predict(selected_X)
        # Compare against the labels of the selected users only; the full
        # label vector T does not line up with these predictions.
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)

    if roc:
        plt.show()
コード例 #18
0
def one_hundert_k_obfuscated(classifier):
    """Train on masked 100k data and draw the ROC on original 100k data.

    A logistic regression is fitted on the first 80% of the masked matrix
    with ``file_index=-1`` and evaluated with ``Utils.ROC_plot`` on the last
    20% of the original matrix. ``classifier`` is not used here.
    """
    original = MD.load_user_item_matrix_100k()
    # Loaded as before; this matrix is not used further down.
    X2 = MD.load_user_item_matrix_100k_masked(file_index=1)
    masked = MD.load_user_item_matrix_100k_masked(file_index=-1)

    T = MD.load_gender_vector_100k()
    train_end = int(0.8 * len(masked))
    test_start = int(0.8 * len(original))
    X_train, T_train = masked[:train_end], T[:train_end]
    X_test, T_test = original[test_start:], T[test_start:]

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2', random_state=np.random.RandomState(0))
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
コード例 #19
0
def plot_genre_1m():
    """Show genre totals of the movie-genre matrix, uncombined and combined.

    Two bar charts sharing a y axis display the column sums of the matrix
    loaded with ``combine=False`` and with ``combine=True``.
    """
    genres = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama",
              "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War",
              "Western"]
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    panels = (
        (ax1, False, "genre distribution in ML 1m"),
        (ax2, True, "genre distribution in ML 1m, Drama&Romance are combined to Drama ect."),
    )
    for axis, combine, title in panels:
        movie_genre = MD.load_movie_genre_matrix_1m(combine=combine)
        axis.bar(genres, np.sum(movie_genre, axis=0))
        axis.set_title(title)
        plt.setp(axis.get_xticklabels(), rotation=-45, ha="left")
    plt.show()
コード例 #20
0
def show_avg_rating_gender_per_movie(movie_id=1):
    """Plot the average positive rating of one movie column per gender.

    Args:
        movie_id: Column of the user-item matrix to inspect. It is used
            directly as a column index (NOTE(review): other code treats
            movie ids as 1-based -- confirm which is intended here).
    """
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()
    male_ratings, female_ratings = [], []
    for user_id, rating in enumerate(user_item[:, movie_id]):
        if not rating > 0:
            continue
        # Every user not marked 'M' is counted as female.
        bucket = male_ratings if gender_dict[user_id] == 'M' else female_ratings
        bucket.append(rating)

    averages = [np.average(male_ratings), np.average(female_ratings)]
    plt.bar(["male", "female"], averages)
    plt.show()
コード例 #21
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def loyal_vs_diverse():
    """Explore how strongly users concentrate on a single genre.

    Genre vectors of all movies listed per user in ``ml-1m/ratings.dat`` are
    summed. "Loyalty" is the largest genre count divided by the total. For
    the thresholds 0.5, 0.6 and 0.7 the loyal users (gender and top-5 genres)
    and their number are printed; then every user's loyalty is scattered
    (blue for 'M', red otherwise) and the average per gender is printed.
    """
    genres = [
        "Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    movie_genre = MD.load_movie_genre_matrix_1m(combine=True)
    user_genre_distr = np.zeros(shape=(6040, movie_genre.shape[1]))
    user_gender_dict = MD.gender_user_dictionary_1m()
    print(user_genre_distr.shape)
    with open("ml-1m/ratings.dat", 'r') as f:
        for line in f:
            user_field, movie_field, _rating, _ = line.split("::")
            # Ids in the file are 1-based; matrix rows are 0-based.
            movie_row = int(movie_field) - 1
            user_row = int(user_field) - 1
            user_genre_distr[user_row, :] += movie_genre[movie_row, :]

    for loyal_percent in [0.5, 0.6, 0.7]:
        loyal_count = 0
        for user_index, user in enumerate(user_genre_distr):
            if not max(user) / sum(user) > loyal_percent:
                continue
            # Print the user: gender first, then the five strongest genres.
            print(user_gender_dict[user_index])
            for index in user.argsort()[-5:][::-1]:
                print(genres[index], user[index])
            loyal_count += 1
        print("For threshold", loyal_percent, ",", loyal_count,
              "users are considered loyal")

    user_loyalty_male = []
    user_loyalty_female = []
    for user_index, user in enumerate(user_genre_distr):
        loyalty = max(user) / sum(user)
        if user_gender_dict[user_index] == 'M':
            user_loyalty_male.append(loyalty)
            plt.scatter(user_index, loyalty, c='b')
        else:
            user_loyalty_female.append(loyalty)
            plt.scatter(user_index, loyalty, c='r')
    print(np.average(user_loyalty_male))
    print(np.average(user_loyalty_female))
コード例 #22
0
ファイル: FailureAnalysis.py プロジェクト: STrucks/BlurMore
def loyal_ratings():
    """Evaluate the gender classifier on test users grouped by genre loyalty.

    A logistic regression is trained on the first 80% of the normalized 1m
    user-item matrix. The remaining users are filtered with
    ``Utils.is_loyal`` for four (lower, upper] loyalty intervals. For each
    interval a ROC curve is drawn in a 2x2 grid and the performance measures
    from ``Utils.performance_measures`` are printed.
    """
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    split = int(0.8 * len(X))
    X_train, T_train = X[0:split], T[0:split]
    X_test = X[split:]
    print(len(X_test))
    from sklearn.linear_model import LogisticRegression

    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    roc = True
    # 1-based ids of the held-out users; identical for every interval.
    test_ids = [i + 1 for i in range(split, len(X))]
    # NOTE(review): the intervals below overlap / leave gaps
    # (0.17-0.19, then 0.35-0.42) -- confirm they are intended.
    upper_bound = [0.17, 0.19, 0.42, 1]
    for index, percent_loyal in enumerate([0.0, 0.17, 0.35, 0.42]):
        selected_ids = Utils.is_loyal(test_ids, loyal_percent_lower=percent_loyal,
                                      loyal_percent_upper=upper_bound[index])
        selected_indecies = [i - 1 for i in selected_ids]
        selected_X = X[selected_indecies]
        selected_T = T[selected_indecies]

        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)

        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            plt.title('Receiver Operating Characteristic with users having a loyality between ' + str(
                percent_loyal) + ' and ' + str(upper_bound[index]) + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For loyality =", percent_loyal, ":")
        Y = model.predict(selected_X)
        # Compare against the labels of the selected users only; the full
        # label vector T does not line up with these predictions.
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)
    if roc:
        plt.show()
コード例 #23
0
def is_loyal(user_ids, loyal_percent_lower=0.4, loyal_percent_upper=1):
    """Return the ids of users whose genre loyalty lies in (lower, upper].

    Loyalty is the largest entry of a user's summed genre vector divided by
    the sum of that vector, built from all lines of ``ml-1m/ratings.dat``.

    Args:
        user_ids: 1-based user ids to check.
        loyal_percent_lower: Exclusive lower loyalty bound.
        loyal_percent_upper: Inclusive upper loyalty bound.

    Returns:
        List of the ids from ``user_ids`` that fall into the interval, in
        input order.
    """
    import MovieLensData as MD
    genres = [
        "Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    movie_genre = MD.load_movie_genre_matrix_1m(combine=True)
    user_genre_distr = np.zeros(shape=(6040, movie_genre.shape[1]))
    with open("ml-1m/ratings.dat", 'r') as f:
        for line in f:
            user_field, movie_field, _rating, _ = line.split("::")
            # Ids in the file are 1-based; matrix rows are 0-based.
            movie_row = int(movie_field) - 1
            user_row = int(user_field) - 1
            user_genre_distr[user_row, :] += movie_genre[movie_row, :]

    loyal_users = []
    for user_id in user_ids:
        genre_counts = user_genre_distr[user_id - 1, :]
        loyalty = max(genre_counts) / sum(genre_counts)
        if loyal_percent_upper >= loyalty > loyal_percent_lower:
            loyal_users.append(user_id - 1 + 1)
    return loyal_users
コード例 #24
0
def compare_real_fake():
    """Show the top-left 40x40 corner of real and masked matrices as images.

    Panels of a 3x3 grid display the original matrix, the masked matrices
    with ``file_index`` 12 and 17, and the differences between the original
    and each masked matrix.
    """
    import RealFakeData as RFD
    corner = (slice(0, 40), slice(0, 40))
    real = MD.load_user_item_matrix_1m()[corner]
    fake_bm = RFD.load_user_item_matrix_1m_masked(file_index=12)[corner]
    fake_bmpp = RFD.load_user_item_matrix_1m_masked(file_index=17)[corner]
    print(fake_bmpp.shape)

    def show_panel(position, image, title):
        plt.subplot(3, 3, position)
        plt.imshow(image)
        plt.title(title)

    show_panel(1, real, "real")
    show_panel(4, fake_bm, "fake_bm")
    show_panel(7, fake_bmpp, "fake_bmpp")
    show_panel(5, real - fake_bm, "real-fake_bm")
    show_panel(8, real - fake_bmpp, "real-fake_bmpp")

    plt.show()
コード例 #25
0
ファイル: RealFakeData.py プロジェクト: STrucks/BlurMore
def load_real_fake_data_ML_1m(file_index=24):
    """Build a shuffled real-vs-masked data set from the 1m matrices.

    The first half of the rows of the original user-item matrix is labelled
    1, the second half of the rows of the masked matrix is labelled 0.

    Args:
        file_index: Passed to ``load_user_item_matrix_1m_masked``.

    Returns:
        Tuple ``(data, labels)`` as returned by ``Utils.shuffle_two_arrays``.
    """
    real = MD.load_user_item_matrix_1m()
    real = real[0:int(real.shape[0] / 2), :]
    fake = load_user_item_matrix_1m_masked(file_index=file_index)
    fake = fake[int(fake.shape[0] / 2):, :]

    nr_real, nr_fake = real.shape[0], fake.shape[0]
    data = np.zeros(shape=(nr_real + nr_fake, real[0].shape[0]))
    labels = np.zeros(shape=(nr_real + nr_fake, ))
    # Real rows come first (label 1), masked rows follow (label 0).
    data[:nr_real, :] = real
    labels[:nr_real] = 1
    data[nr_real:, :] = fake
    labels[nr_real:] = 0

    from Utils import shuffle_two_arrays
    data, labels = shuffle_two_arrays(data, labels)
    return data, labels
コード例 #26
0
def show_correlation_genre():
    """Plot a heatmap of how often two genres are set on the same movie.

    Loads the ML-1m movie-genre matrix, prints its shape, and counts for every
    pair of distinct genre columns the number of rows in which both entries
    equal 1. The resulting symmetric count matrix (diagonal stays zero) is
    drawn with seaborn and labelled with the genre names.
    """
    import seaborn as sb

    genres = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
              "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
    movie_genre = MD.load_movie_genre_matrix_1m()
    print(movie_genre.shape)

    n_columns = movie_genre.shape[1]
    cooc = np.zeros(shape=(n_columns, n_columns))
    # Simple co-occurrence count: every unordered pair of genres that are both
    # set on a movie is counted once, mirrored to keep the matrix symmetric.
    for movie in movie_genre:
        active = [index for index, flag in enumerate(movie) if flag == 1]
        for position, one in enumerate(active):
            for two in active[position + 1:]:
                cooc[one, two] += 1
                cooc[two, one] += 1
    plt.rcParams.update({'font.size': 22})

    _, ax = plt.subplots()
    sb.heatmap(cooc, linewidths=0.5, ax=ax)

    # seaborn draws cell i over the interval [i, i + 1], so the ticks have to
    # sit at i + 0.5 for each label to line up with the centre of its cell.
    tick_positions = np.arange(len(genres)) + 0.5
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(genres)
    ax.set_yticklabels(genres)
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    plt.setp(ax.get_xticklabels(), rotation=-30,  ha="left", rotation_mode="anchor")
    plt.title("Co-occurrence of movie genres in ML 1m")
    plt.show()
コード例 #27
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def genre_exploration_1m():
    """Plot genre frequencies and the number of genres per movie for ML-1m.

    Shows two bar charts one after the other and prints the numbers behind
    each of them: first the per-genre totals, then a histogram of how many
    genres the movies carry.
    """
    import MovieLensData as MD
    import matplotlib.pyplot as plt
    import collections

    genre_names = [
        "Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    genre_flags = MD.load_movie_genre_matrix_1m()

    # First chart: genre frequencies (sum over axis 0 of the matrix).
    per_genre_total = np.sum(genre_flags, axis=0)
    plt.bar(genre_names, per_genre_total)
    plt.show()
    print(per_genre_total)

    # Second chart: number of genres per movie (sum over axis 1), tallied
    # into "how many movies have k genres".
    genres_per_movie = np.sum(genre_flags, axis=1)
    tally = collections.Counter(genres_per_movie)
    print(tally)
    plt.bar(tally.keys(), tally.values())
    plt.xlabel("#genres")
    plt.ylabel('frequency')
    plt.show()
コード例 #28
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def test_avg_rating_gender_per_movie_1m():
    """Print the ML-1m movies whose ratings differ significantly by gender.

    For every column of the user-item matrix the ratings greater than zero are
    split into those of users marked ``'M'`` in the gender dictionary and all
    others, and the two groups are compared with a Mann-Whitney U test. The
    significance level 0.05 is divided by the number of columns (Bonferroni
    correction).

    Output, in order:

    * the number of movie columns,
    * for each significant movie with unequal group means one line
      ``<id>::<title>::<genres>::M`` or ``...::F`` (the group with the higher
      mean rating), where ``<id>`` is the column index plus one,
    * ``Testing failed for <column index>`` whenever the test cannot be
      carried out (a group is empty or scipy rejects the input),
    * the number of significant movies.

    :raises KeyError: if a significant movie id is missing from
        ``ml-1m/movies.dat``.
    """
    import MovieLensData as MD
    from scipy.stats import mannwhitneyu

    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()

    # The ML-1m movies.dat file is Latin-1 encoded; relying on the platform
    # default (UTF-8 on most systems) fails on titles with accented letters.
    movies = {}
    with open("ml-1m/movies.dat", 'r', encoding="latin-1") as f:
        for line in f:
            movie_key, name, genre = line.rstrip("\n").split("::")
            movies[int(movie_key)] = name + "::" + genre

    n_movies = len(user_item[0])
    print(n_movies)
    alpha = 0.05 / n_movies  # Bonferroni-corrected significance level

    # Gender of every user row, looked up once instead of once per rating.
    is_male = np.array(
        [gender_dict[user_id] == 'M' for user_id in range(len(user_item))],
        dtype=bool)

    counter = 0
    for movie_id in range(n_movies):
        ratings = user_item[:, movie_id]
        rated = ratings > 0
        male_ratings = ratings[rated & is_male]
        female_ratings = ratings[rated & ~is_male]

        if len(male_ratings) == 0 or len(female_ratings) == 0:
            print("Testing failed for", movie_id)
            continue

        # Only the statistical test itself is guarded; any other error is a
        # genuine bug and must not be reported as a failed test.
        try:
            _, p_value = mannwhitneyu(male_ratings, female_ratings)
        except ValueError:
            print("Testing failed for", movie_id)
            continue

        # Written as a negated "<" so that a NaN p-value counts as not
        # significant.
        if not p_value < alpha:
            continue
        counter += 1

        male_mean = np.average(male_ratings)
        female_mean = np.average(female_ratings)
        if male_mean > female_mean:
            tag = "M"
        elif male_mean < female_mean:
            tag = "F"
        else:
            continue
        print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::" + tag)

    print(counter)
コード例 #29
0
def one_hundert_k(classifier):
    """Run ``classifier`` on the leading 80% of the ML-100k gender data.

    The user-item matrix and the gender vector are loaded, split by row
    position (no shuffling) and the training part is handed to the callback.

    :param classifier: callable invoked as ``classifier(X_train, T_train)``;
        its return value is discarded.
    """
    features = MD.load_user_item_matrix_100k()
    targets = MD.load_gender_vector_100k()

    cut = int(0.8 * len(features))
    X_train, T_train = features[0:cut], targets[0:cut]
    # The trailing 20% is sliced off as well but not used in this function.
    X_test, T_test = features[cut:], targets[cut:]

    # The shape is printed twice; the feature-selection / normalisation steps
    # that could change it in between are currently disabled.
    print(X_train.shape)
    print(X_train.shape)

    classifier(X_train, T_train)
コード例 #30
0
def real_vs_fake():
    """Fit logistic regressions on the real-vs-fake data and print extreme features.

    Loads the data set built from masked file 49, runs ``Classifiers.log_reg``
    on the first 80% of the rows, then fits one L2-penalised logistic
    regression per fold of a 10-fold stratified split of that training part.
    The coefficients are ranked within each fold, the ranks are averaged over
    the folds, and the features are sorted by that average rank. Up to ten
    features from each end of the ordering are printed together with a line
    from ``ml-1m/movies.dat`` and the absolute summed coefficient.

    NOTE(review): the ``*_m`` / ``*_f`` and ``top_male`` / ``top_female`` names
    look carried over from a gender experiment; here the targets come from
    ``load_real_fake_data_ML_1m`` - confirm what the two ends stand for.
    """
    X, T = RFData.load_real_fake_data_ML_1m(file_index=49)
    #X, T = RFData.load_real_fake_data_ML_100k()
    #print(type(Y[0]))
    # Classifiers.log_reg(X, Y)
    # 80/20 split by row position; the held-out part (X_test, T_test) is not
    # used in the code visible here.
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]

    Classifiers.log_reg(X_train, T_train)

    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    coefs = []
    # One accumulator slot per feature (length taken from the second row).
    avg_coefs = np.zeros(shape=(len(X_train[1]),))

    random_state = np.random.RandomState(0)
    # Only the training indices of each fold are used; `test` is ignored.
    for train, test in cv.split(X_train, T_train):
        x, t = X_train[train], T_train[train]
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        # rank the coefs:
        # (ss is presumably scipy.stats - verify against the file's imports)
        ranks = ss.rankdata(model.coef_[0])
        coefs.append(ranks)
        # print(len(model.coef_[0]),len(X_train[0]))
        # Despite the name this is a running sum over the folds; it is never
        # divided by the number of folds.
        avg_coefs += model.coef_[0]

    # Average rank of every feature over the folds.
    coefs = np.average(coefs, axis=0)
    # Rows are [average rank, feature index, summed coefficient] ...
    coefs = [[coefs[i], i, avg_coefs[i]] for i in range(len(coefs))]
    # ... sorted ascending by average rank (ties fall back to feature index).
    coefs = np.asarray(list(sorted(coefs)))

    values = coefs[:, 2]
    # Positions whose summed coefficient equals the smallest absolute value.
    # NOTE(review): this only matches if the smallest-magnitude coefficient is
    # itself non-negative; if it is negative nothing matches and the indexing
    # below raises IndexError.
    index_zero = np.where(values == np.min(np.abs(values)))
    top_male = index_zero[0][0]
    top_female = index_zero[0][-1]
    # Leading rows of the ordering: feature indices, a rank-derived score and
    # absolute summed coefficients.
    L_m = coefs[:top_male, 1]
    # 3952 is hard-coded; presumably the number of movie columns in ML-1m -
    # TODO confirm. R_m and R_f are not used in the code visible here.
    R_m = 3952 - coefs[:top_male, 0]
    C_m = np.abs(coefs[:top_male, 2])
    # Trailing `top_female` rows of the ordering, reversed so that the last
    # row comes first.
    L_f = coefs[coefs.shape[0] - top_female:, 1]
    L_f = list(reversed(L_f))
    R_f = coefs[coefs.shape[0] - top_female:, 0]
    R_f = list(reversed(R_f))
    C_f = coefs[coefs.shape[0] - top_female:, 2]
    C_f = list(reversed(np.abs(C_f)))
    # index_id is not used in the code visible here.
    id_index, index_id = MD.load_movie_id_index_dict()
    # Raw lines of movies.dat without the trailing newline.
    movies = []
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f.readlines():
            movies.append(line.replace("\n", ""))

    # Print at most ten entries from each end. int(val)+1 presumably turns a
    # 0-based column index into a movie id that id_index maps to a position in
    # `movies` - TODO confirm against MD.load_movie_id_index_dict.
    for index, val in enumerate(L_m[0:min(10,len(L_m))]):
        print(index, movies[id_index[int(val)+1]], C_m[index])
    for index, val in enumerate(L_f[0:min(10,len(L_f))]):
        print(index, movies[id_index[int(val)+1]], C_f[index])