コード例 #1
0
ファイル: Obfuscation.py プロジェクト: STrucks/BlurMore
def rating_add_1m():
    """Obfuscate the 1M user-item matrix by adding random ratings per user.

    For every user, roughly ``percentage`` times the number of that user's
    existing ratings are added at randomly chosen, currently unrated items.
    Each added rating is drawn uniformly from 1..5. The result is written to
    ``ml-1m/random_added_obfuscated_<percentage>.dat`` as
    ``user::movie::rating::000000000`` lines (1-based ids) and returned.

    Returns:
        The obfuscated user-item matrix.
    """
    X = MD.load_user_item_matrix_1m()
    X_obf = MD.load_user_item_matrix_1m()
    percentage = 0.05
    # Give up on a user after this many draws that hit an already-rated item.
    max_collisions = 100
    for user_index, user in enumerate(X):
        nr_ratings = sum(1 for rating in user if rating > 0)
        target = nr_ratings * percentage

        added = 0
        safety_counter = 0
        while added < target and safety_counter < max_collisions:
            index = np.random.randint(0, len(user))
            if X_obf[user_index, index] > 0:
                safety_counter += 1
                continue
            X_obf[user_index, index] = np.random.randint(1, 6)
            # Count the new rating; without this the budget is never reached
            # and ratings keep being added until the collision limit trips.
            added += 1

    # output the data in a file:
    with open("ml-1m/random_added_obfuscated_" + str(percentage) + ".dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write("{}::{}::{}::000000000\n".format(
                        index_user + 1, index_movie + 1, int(rating)))
    return X_obf
コード例 #2
0
def comp_BM_and_BMpp():
    """Plot rating counts of the first 50 movies for three data variants.

    Draws three bar charts side by side (original matrix, masked matrix with
    file index 63, masked matrix with file index 75), prints the total number
    of ratings in each window and the element-wise differences between the
    count vectors, then shows the figure.
    """
    plt.rcParams.update({'font.size': 28})
    _, axes = plt.subplots(1, 3, sharey=True)
    first_movie, last_movie = 0, 50
    positions = range(first_movie, last_movie)

    def ratings_per_movie(matrix):
        # One count per movie column inside the inspected window.
        window = np.transpose(matrix)[first_movie:last_movie]
        return [sum(1 if value > 0 else 0 for value in column) for column in window]

    # (loader, panel title, label used for the printed total)
    panels = (
        (lambda: MD.load_user_item_matrix_1m(),
         "(A)\nOriginal data", "Original Data:"),
        (lambda: MD.load_user_item_matrix_1m_masked(file_index=63),
         "(B)\nBlurMe data", "BlurMe Data:"),
        (lambda: MD.load_user_item_matrix_1m_masked(file_index=75),
         "(C)\nBlurM(or)e data", "BlurMe++ Data:"),
    )

    count_vectors = []
    for axis, (load_matrix, title, label) in zip(axes, panels):
        counts = ratings_per_movie(load_matrix())
        count_vectors.append(np.asarray(counts))
        axis.bar(positions, counts)
        axis.set_title(title)
        axis.set_xlabel("movie ID")
        axis.set_ylabel("#ratings")
        print(label, sum(counts))

    original_counts, blurme_counts, blurmore_counts = count_vectors
    print(original_counts - blurme_counts)
    print(blurme_counts - blurmore_counts)
    plt.show()
コード例 #3
0
ファイル: RealFakeData.py プロジェクト: STrucks/BlurMore
def load_real_fake_data_ML_1m(file_index=24):
    """Build a shuffled real-vs-fake data set from the 1M matrices.

    The first half of the users of the original matrix is labelled 1, the
    second half of the users of the masked matrix (selected by
    ``file_index``) is labelled 0. Both parts are stacked and shuffled
    together with their labels.

    Args:
        file_index: index passed to ``load_user_item_matrix_1m_masked``.

    Returns:
        Tuple ``(data, labels)`` as returned by ``shuffle_two_arrays``.
    """
    original = MD.load_user_item_matrix_1m()
    real = original[0:int(original.shape[0] / 2), :]
    masked = load_user_item_matrix_1m_masked(file_index=file_index)
    fake = masked[int(masked.shape[0] / 2):, :]

    n_real = real.shape[0]
    n_total = n_real + fake.shape[0]
    data = np.zeros(shape=(n_total, real[0].shape[0]))
    labels = np.zeros(shape=(n_total, ))

    # Real users occupy the leading rows and carry label 1 ...
    data[:n_real, :] = real
    labels[:n_real] = 1
    # ... fake users fill the remaining rows and carry label 0.
    data[n_real:, :] = fake
    labels[n_real:] = 0

    from Utils import shuffle_two_arrays
    data, labels = shuffle_two_arrays(data, labels)
    return data, labels
コード例 #4
0
def one_million(classifier):
    """Run ``classifier`` on the normalized 1M matrix with genre targets.

    Targets are the column indices of the ones in the one-hot user-genre
    matrix loaded with ``top=1``. The target range and the class frequencies
    are printed before the classifier is called.

    Args:
        classifier: callable invoked as
            ``classifier(X, T, multiclass=True, nr_classes=17)``.
    """
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()
    one_hot_genres = MD.load_user_genre_matrix_1m(one_hot=True, top=1)
    T = np.argwhere(one_hot_genres == 1)[:, 1]
    print(min(T), max(T))
    # Note from the original author: class 13 (Romance) is lost, apparently
    # because no one has romance as favourite genre. That is plausible, as it
    # correlates strongly with drama and comedy.
    import collections
    import matplotlib.pyplot as plt
    class_frequencies = collections.Counter(T)
    print(class_frequencies)

    X = Utils.normalize(X)
    classifier(X, T, multiclass=True, nr_classes=17)
コード例 #5
0
def compare_real_fake():
    """Show 40x40 corners of the real and two masked matrices as images.

    Plots the real matrix, the masked matrices with file indices 12 and 17,
    and the differences between the real matrix and each masked one on a
    3x3 subplot grid.
    """
    import RealFakeData as RFD
    corner = (slice(0, 40), slice(0, 40))
    real = MD.load_user_item_matrix_1m()[corner]
    fake_bm = RFD.load_user_item_matrix_1m_masked(file_index=12)[corner]
    fake_bmpp = RFD.load_user_item_matrix_1m_masked(file_index=17)[corner]
    print(fake_bmpp.shape)

    # (position in the 3x3 grid, image, title)
    panels = [
        (1, real, "real"),
        (4, fake_bm, "fake_bm"),
        (7, fake_bmpp, "fake_bmpp"),
        (5, real - fake_bm, "real-fake_bm"),
        (8, real - fake_bmpp, "real-fake_bmpp"),
    ]
    for position, image, title in panels:
        plt.subplot(3, 3, position)
        plt.imshow(image)
        plt.title(title)

    plt.show()
コード例 #6
0
def one_million_obfuscated(classifier):
    """Cross-validate gender prediction on original vs. masked 1M data.

    Loads the gender vector, the original matrix and the masked matrix with
    file index 55, prints a few diagnostics and passes everything to
    ``Utils.ROC_cv_obf`` together with an L2 logistic-regression model.

    Args:
        classifier: not used in the visible part of this function.
    """
    T = MD.load_gender_vector_1m()
    X1 = MD.load_user_item_matrix_1m()
    X2 = MD.load_user_item_matrix_1m_masked(file_index=55)
    print(X1.shape, X2.shape)

    # 80/20 split: train on the original data, test on the masked data.
    cut_original = int(0.8 * len(X1))
    cut_masked = int(0.8 * len(X2))
    X_train, T_train = X1[:cut_original], T[:cut_original]
    X_test, T_test = X2[cut_masked:], T[cut_masked:]
    print(list(X1[0, :]))
    print(list(X2[0, :]))
    print("before", X_train.shape)
    print(X_train.shape)

    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)

    Utils.ROC_cv_obf(X1, X2, T, model)

    # NOTE(review): this second model and the train/test split above are not
    # used in the visible code; the function may continue beyond this point.
    model = LogisticRegression(penalty='l2', random_state=random_state)
コード例 #7
0
def one_million(classifier):
    """Classify gender from the 1M matrix after feature selection.

    Features are selected with ``Utils.select_male_female_different``, the
    data is split 80/20, ``classifier`` is run on the training part, and an
    L2 logistic regression fitted on the training part is evaluated on the
    test part via ``Utils.ROC_plot``.

    Args:
        classifier: callable invoked as ``classifier(X_train, T_train)``.
    """
    ratings = MD.load_user_item_matrix_1m()
    genders = MD.load_gender_vector_1m()
    ratings = feature_selection(ratings, genders, Utils.select_male_female_different)

    cut = int(0.8 * len(ratings))
    X_train, T_train = ratings[:cut], genders[:cut]
    X_test, T_test = ratings[cut:], genders[cut:]

    print("before", X_train.shape)
    print(X_train.shape)

    classifier(X_train, T_train)

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2', random_state=np.random.RandomState(0))
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
コード例 #8
0
def feature_importance_1m():
    """Compare movie importances from logistic regression and random forest.

    Fits both models ten times on the 1M matrix with gender targets and
    averages the logistic-regression coefficients and the random-forest
    feature importances. Shows a histogram of the averaged forest
    importances, prints the names of the top 100 movies of three rankings
    side by side, and finally prints which forest-ranked names also appear
    in either coefficient ranking.
    """
    plt.rcParams.update({'font.size': 18})

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    import seaborn as sns
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    n_runs = 10
    coef_importance = np.zeros(shape=(X.shape[1],))
    forest_importance = np.zeros(shape=(X.shape[1],))

    for _ in range(n_runs):
        logreg = LogisticRegression()
        forest = RandomForestClassifier()
        logreg.fit(X, T)
        forest.fit(X, T)
        coef_importance += logreg.coef_[0]
        forest_importance += forest.feature_importances_
    coef_importance /= n_runs
    forest_importance /= n_runs

    plt.hist(forest_importance, bins=np.linspace(0, 0.001, 50))
    plt.xlabel("importance")
    plt.ylabel("frequency")
    plt.title("Importance of movies distribution")
    plt.show()

    # Rankings of (score, 1-based movie id) pairs.
    coef_descending = sorted(zip(coef_importance, range(1, len(coef_importance) + 1)))[::-1]
    coef_ascending = sorted(zip(coef_importance, range(1, len(coef_importance) + 1)))
    forest_descending = sorted(zip(forest_importance, range(1, len(forest_importance) + 1)))[::-1]
    set1 = set()
    set2 = set()
    set3 = set()

    names = MD.load_movie_id_dictionary_1m()
    top = 100
    rows = zip(coef_descending[0:top], coef_ascending[0:top], forest_descending[0:top])
    for (_, high_coef_id), (_, low_coef_id), (_, forest_id) in rows:
        print(names[high_coef_id], "|", names[low_coef_id], "|", names[forest_id])
        set1.add(names[high_coef_id])
        set2.add(names[low_coef_id])
        set3.add(names[forest_id])

    print(set3.intersection(set2.union(set1)))

    """
コード例 #9
0
def rating_distr():
    """Print and plot rating-value statistics for three data variants.

    For the original matrix and the masked matrices with file indices 71 and
    55, counts how often each rating value 0..5 occurs, prints summary
    statistics and draws the frequencies of ratings 1..5 as a bar chart.
    For the original matrix the number of distinct rated movies is printed
    as well.
    """
    T = MD.load_gender_vector_1m()
    X = MD.load_user_item_matrix_1m()
    _, (ax_original, ax_blurme, ax_blurmepp) = plt.subplots(1, 3, sharey=True)

    def tally(matrix, rated_movie_ids=None):
        # Count rating values, print the summary lines and return the counts.
        # If a list is passed, the 1-based ids of rated movies are appended.
        counts = np.zeros(shape=(6,))
        observed = []
        for row in matrix:
            for column, value in enumerate(row):
                counts[int(value)] += 1
                if value > 0:
                    if rated_movie_ids is not None:
                        rated_movie_ids.append(column + 1)
                    observed.append(value)
        rated = counts[1:]
        print(counts)
        print(sum(rated), np.mean(rated), np.var(rated))
        print("mean:", np.dot(np.arange(0, 6), counts) / sum(counts), "without 0:",
              np.dot(np.arange(1, 6), rated) / sum(rated))
        print("Avg", np.average(observed), "var", np.var(observed))
        return counts

    movie_ids = []
    frequencies = tally(X, movie_ids)
    print(len(set(movie_ids)))
    ax_original.bar(range(5), frequencies[1:])
    ax_original.set_xlabel("Original")

    X = MD.load_user_item_matrix_1m_masked(file_index=71)  # greedy 10%
    frequencies = tally(X)
    ax_blurme.bar(range(5), frequencies[1:])
    ax_blurme.set_xlabel("BlurMe")

    X = MD.load_user_item_matrix_1m_masked(file_index=55)  # BlurMe++ 10%, fac=2
    frequencies = tally(X)
    ax_blurmepp.bar(range(5), frequencies[1:])
    ax_blurmepp.set_xlabel("BlurMe++")
    plt.show()
コード例 #10
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def feature_importance_1m():
    """Explore random-forest movie importances for gender prediction.

    Averages the feature importances of ten random forests, plots the first
    30 of them, prints the 1-based ids (and the number) of movies with an
    importance of at least 0.002, and finally scatters importance against
    the number of ratings per movie (all movies, and movies with fewer than
    100 ratings).
    """
    from sklearn.ensemble import RandomForestClassifier
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    n_runs = 10
    importance = np.zeros(shape=(X.shape[1], ))
    for _ in range(n_runs):
        forest = RandomForestClassifier()
        forest.fit(X, T)
        importance += forest.feature_importances_
    importance /= n_runs

    leading = importance[0:30]
    plt.bar(range(1, len(leading) + 1), leading)
    plt.xlabel("movie index")
    plt.ylabel("importance")
    plt.show()

    important_ids = [position + 1 for position, score in enumerate(importance) if score >= 0.002]
    for movie_id in important_ids:
        print(movie_id, end=",")
    print()
    print(len(important_ids))

    nr_ratings = np.zeros(shape=(X.shape[1], ))
    for position, column in enumerate(np.transpose(X)):
        nr_ratings[position] = sum(1 for rating in column if rating > 0)

    # Group rating counts by importance value and keep the raw pairs.
    avg_nr_per_importance = {}
    nr_ratings_importance = []
    for nr, imp in zip(nr_ratings, importance):
        avg_nr_per_importance.setdefault(imp, []).append(nr)
        nr_ratings_importance.append([nr, imp])

    plt.subplot(1, 2, 1)
    for nr, imp in nr_ratings_importance:
        plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")

    plt.subplot(1, 2, 2)
    for nr, imp in nr_ratings_importance:
        if not nr < 100:
            continue
        plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")

    plt.show()
コード例 #11
0
def one_million(classifier):
    """Run ``classifier`` on the normalized 1M matrix with occupation targets.

    Args:
        classifier: callable invoked as ``classifier(X, T, multiclass=True)``.
    """
    max_user = 6040
    max_item = 3952
    occupations = MD.load_occupation_vector_1m()
    features = Utils.normalize(MD.load_user_item_matrix_1m())
    classifier(features, occupations, multiclass=True)
コード例 #12
0
def one_million(classifier):
    """Run ``classifier`` on the 1M matrix with age targets (border=30).

    Prints the minimum and the mean of the first matrix column before the
    classifier is called.

    Args:
        classifier: callable invoked as ``classifier(X, T)``.
    """
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()
    T = MD.load_age_vector_1m(border=30)
    first_column = X[:, 0]
    print(min(first_column), np.mean(first_column))
    classifier(X, T)
コード例 #13
0
def movie_ratings():
    """Print the sorted per-movie rating counts and the set of distinct counts."""
    X = MD.load_user_item_matrix_1m()
    # One entry per movie: how many users gave it a rating above zero.
    freqs = [sum(1 for rating in movie if rating > 0) for movie in np.transpose(X)]
    print(sorted(freqs))
    print(set(freqs))
コード例 #14
0
ファイル: FailureAnalysis.py プロジェクト: STrucks/BlurMore
def lot_ratings():
    """Evaluate gender prediction for test users grouped by rating count.

    A logistic regression is trained on the first 80% of the normalized 1M
    matrix. The remaining users are grouped into four bins by the number of
    items they rated (inclusive bounds, see ``rating_bins``). For each bin a
    ROC curve is drawn in a 2x2 grid and the performance measures returned
    by ``Utils.performance_measures`` are printed.
    """
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    split = int(0.8 * len(X))
    X_train, T_train = X[:split], T[:split]
    X_test, T_test = np.asarray(X[split:]), np.asarray(T[split:])

    from sklearn.linear_model import LogisticRegression

    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    roc = True
    # Inclusive (min, max) number of ratings per bin.
    rating_bins = [(162, 210), (211, 281), (282, 389), (390, 1000)]

    # The number of rated items per test user does not depend on the bin,
    # so compute it once.
    ratings_per_user = (X_test > 0).sum(axis=1)

    for index, (min_rating, max_rating) in enumerate(rating_bins):
        in_bin = (ratings_per_user >= min_rating) & (ratings_per_user <= max_rating)
        selected_X = X_test[in_bin]
        selected_T = T_test[in_bin]
        if len(selected_X) == 0:
            # predict_proba and roc_curve cannot handle an empty sample.
            print("For max rating =", max_rating, ": no test users in this range")
            continue

        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)

        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            # Lower bound first, so the title reads "between 162 and 210".
            plt.title(
                'Receiver Operating Characteristic with users having rated between ' + str(min_rating) + " and " + str(
                    max_rating) + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For max rating =", max_rating, ":")
        Y = model.predict(selected_X)
        # Compare against the labels of the selected users only; the full
        # label vector does not line up with these predictions.
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)

    if roc:
        plt.show()
コード例 #15
0
def rating_exploration():
    """Print and plot three rating distributions of the 1M matrix.

    1. Number of users per rating count (users with more than 200 ratings
       are left out); users without any rating are printed.
    2. Frequency of every value that occurs in the matrix.
    3. Number of items per rating count; items without any rating are
       printed and left out.
    """
    X = MD.load_user_item_matrix_1m()

    users_per_count = {}
    for row_index, row in enumerate(X):
        rated = sum(1 for value in row if value > 0)
        if rated == 0:
            print(row_index, row)
        if rated > 200:
            continue
        users_per_count[rated] = users_per_count.get(rated, 0) + 1
    print(users_per_count)
    plt.rcParams.update({'font.size': 22})
    plt.bar(users_per_count.keys(), users_per_count.values())
    plt.xlabel("#ratings per user")
    plt.ylabel("frequency")
    plt.show()

    value_frequency = {}
    for row in X:
        for value in row:
            value_frequency[value] = value_frequency.get(value, 0) + 1
    print(value_frequency)
    print(X.shape[0]*X.shape[1])
    plt.bar(value_frequency.keys(), value_frequency.values())
    plt.show()

    items_per_count = {}
    for column_index, column in enumerate(np.transpose(X)):
        rated = sum(1 for value in column if value > 0)
        if rated == 0:
            print(column_index, column)
            continue
        items_per_count[rated] = items_per_count.get(rated, 0) + 1
    print(items_per_count)
    plt.bar(items_per_count.keys(), items_per_count.values())
    plt.show()
コード例 #16
0
def show_avg_rating_gender_per_movie(movie_id=1):
    """Plot the average male and female rating of one movie as a bar chart.

    Args:
        movie_id: column of the user-item matrix to inspect.
    """
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()

    ratings_by_gender = {"male": [], "female": []}
    for user_id, rating in enumerate(user_item[:, movie_id]):
        if not rating > 0:
            continue
        # Every user whose entry is not 'M' is counted as female.
        group = "male" if gender_dict[user_id] == 'M' else "female"
        ratings_by_gender[group].append(rating)

    male_average = np.average(ratings_by_gender["male"])
    female_average = np.average(ratings_by_gender["female"])
    plt.bar(["male", "female"], [male_average, female_average])
    plt.show()
コード例 #17
0
ファイル: FailureAnalysis.py プロジェクト: STrucks/BlurMore
def loyal_ratings():
    """Evaluate gender prediction for test users grouped by loyalty.

    A logistic regression is trained on the first 80% of the normalized 1M
    matrix. The remaining users are filtered with ``Utils.is_loyal`` into
    four loyalty ranges. For each range a ROC curve is drawn in a 2x2 grid
    and the performance measures returned by ``Utils.performance_measures``
    are printed.
    """
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    split = int(0.8 * len(X))
    X_train, T_train = X[:split], T[:split]
    X_test = X[split:]
    print(len(X_test))
    from sklearn.linear_model import LogisticRegression

    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    roc = True
    # (lower, upper) loyalty bound per subplot.
    loyalty_ranges = [(0.0, 0.17), (0.17, 0.19), (0.35, 0.42), (0.42, 1)]
    # Ids handed to Utils.is_loyal are 1-based; they are the same for every
    # range, so build them once.
    test_ids = [i + 1 for i in range(split, len(X))]

    for index, (percent_loyal, upper_bound) in enumerate(loyalty_ranges):
        selected_ids = Utils.is_loyal(test_ids, loyal_percent_lower=percent_loyal,
                                      loyal_percent_upper=upper_bound)
        selected_indecies = [i - 1 for i in selected_ids]
        if not selected_indecies:
            # predict_proba and roc_curve cannot handle an empty sample.
            print("For loyality =", percent_loyal, ": no test users in this range")
            continue
        selected_X = X[selected_indecies]
        selected_T = T[selected_indecies]

        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)

        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            plt.title('Receiver Operating Characteristic with users having a loyality between ' + str(
                percent_loyal) + ' and ' + str(upper_bound) + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For loyality =", percent_loyal, ":")
        Y = model.predict(selected_X)
        # Compare against the labels of the selected users only; the full
        # label vector does not line up with these predictions.
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)
    if roc:
        plt.show()
コード例 #18
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def rating_exploration_100k():
    """Print and plot three rating distributions of the loaded matrix.

    1. Number of users per rating count; users without any rating are
       printed.
    2. Frequency of every value that occurs in the matrix.
    3. Number of items per rating count; items without any rating are
       printed and left out.
    """
    # NOTE(review): despite the "_100k" in the function name, the 1M matrix
    # is loaded here - confirm which data set is intended.
    X = MD.load_user_item_matrix_1m()

    users_per_count = {}
    for row_index, row in enumerate(X):
        rated = sum(1 for value in row if value > 0)
        if rated == 0:
            print(row_index, row)
        users_per_count[rated] = users_per_count.get(rated, 0) + 1
    print(users_per_count)
    plt.bar(users_per_count.keys(), users_per_count.values())
    plt.show()

    value_frequency = {}
    for row in X:
        for value in row:
            value_frequency[value] = value_frequency.get(value, 0) + 1
    print(value_frequency)
    print(X.shape[0] * X.shape[1])
    plt.bar(value_frequency.keys(), value_frequency.values())
    plt.show()

    items_per_count = {}
    for column_index, column in enumerate(np.transpose(X)):
        rated = sum(1 for value in column if value > 0)
        if rated == 0:
            print(column_index, column)
            continue
        items_per_count[rated] = items_per_count.get(rated, 0) + 1
    print(items_per_count)
    plt.bar(items_per_count.keys(), items_per_count.values())
    plt.show()
コード例 #19
0
ファイル: DataExploration.py プロジェクト: STrucks/BlurMore
def test_avg_rating_gender_per_movie_1m():
    """Print the movies that men and women rate significantly differently.

    For every column of the 1M user-item matrix a Mann-Whitney U test
    compares the male and the female ratings. Movies whose p-value is below
    a Bonferroni-corrected 5% level are printed as ``id::name::genre::M`` or
    ``...::F``, depending on which group has the higher average rating.
    Movies for which the test cannot be computed are reported with
    "Testing failed for". The number of significant movies is printed last.
    """
    import MovieLensData as MD
    from scipy.stats import mannwhitneyu
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()

    # NOTE(review): the file is opened with the platform default encoding -
    # confirm this matches the encoding of movies.dat.
    movies = {}
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f:
            movie_key, name, genre = line.rstrip("\n").split("::")
            movies[int(movie_key)] = name + "::" + genre

    nr_movies = len(user_item[0])
    # Bonferroni correction: one test per movie column.
    significance_level = 0.05 / nr_movies
    counter = 0
    print(nr_movies)
    for movie_id in range(nr_movies):
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(user_item[:, movie_id]):
            if rating > 0:
                if gender_dict[user_id] == 'M':
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)

        # Only the expected failures are caught: the test rejecting its
        # input (ValueError) or a column without an entry in movies.dat
        # (KeyError). Anything else, including Ctrl-C, propagates.
        try:
            _, p_value = mannwhitneyu(male_ratings, female_ratings)

            if p_value < significance_level:
                counter += 1
                male_average = np.average(male_ratings)
                female_average = np.average(female_ratings)
                if male_average > female_average:
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::M")
                if male_average < female_average:
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::F")
        except (ValueError, KeyError):
            print("Testing failed for", movie_id)

    # NOTE(review): looks like a leftover debug line (prints id 2 with the
    # entry of movie 1); kept so the output stays unchanged.
    print(str(1 + 1) + "::" + movies[1])
    print(counter)
コード例 #20
0
ファイル: Obfuscation.py プロジェクト: STrucks/BlurMore
def rating_swap_1m():
    """Rebalance the 1M matrix by moving ratings from popular to rare movies.

    Movies with more than ``high_bound`` ratings lose randomly chosen
    ratings until ``high_bound`` remain. The number of removed ratings is
    divided evenly (rounded down) over the movies with fewer than
    ``low_bound`` ratings; each of those receives that many random ratings
    (1..5) from users who had not rated it. The result is written to
    ``ml-1m/rebalanced_(<low_bound>,<high_bound>).dat`` as
    ``user::movie::rating::000000000`` lines (1-based ids) and returned.

    Returns:
        The obfuscated user-item matrix (users as rows).
    """
    plot = False
    low_bound, high_bound = 100, 1500
    # Work on the transposed matrices so that every row is one movie.
    X = np.transpose(MD.load_user_item_matrix_1m())
    X_obf = np.transpose(MD.load_user_item_matrix_1m())
    nr_ratings = (X > 0).sum(axis=1)

    if plot:
        # Create the figure only when it is going to be shown.
        fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
        ax1.bar(range(1, len(X) + 1), nr_ratings)
        ax1.set_xlabel("movie id")
        ax1.set_ylabel("nr ratings")

    # we want to remove ratings from movies that have more than high_bound ratings:
    amount_removed = 0
    for item_index in np.argwhere(nr_ratings > high_bound)[:, 0]:
        rated_by = np.argwhere(X[item_index, :] > 0)[:, 0]
        surplus = nr_ratings[item_index] - high_bound
        dropped = np.random.choice(rated_by, size=(surplus,), replace=False)
        amount_removed += len(dropped)
        X_obf[item_index, dropped] = 0

    # now we want to add ratings to movies with a small number of ratings:
    print(nr_ratings)
    few_rated = np.argwhere(nr_ratings < low_bound)[:, 0]
    print(few_rated)
    nr_few_rated_movies = len(few_rated)
    # Without any rarely rated movie there is nothing to distribute; avoid
    # dividing by zero in that case.
    if nr_few_rated_movies:
        nr_to_be_added = amount_removed / nr_few_rated_movies
    else:
        nr_to_be_added = 0.0
    print(nr_to_be_added)
    for item_index in few_rated:
        not_rated_by = np.argwhere(X[item_index, :] == 0)[:, 0]
        chosen = np.random.choice(not_rated_by, size=(int(nr_to_be_added),), replace=False)
        for i in chosen:
            X_obf[item_index, i] = np.random.randint(1, 6)

    if plot:
        # Rating counts after removal and addition, for a visual check.
        ax2.bar(range(1, len(X) + 1), (X_obf > 0).sum(axis=1))
        ax2.set_xlabel("movie id")
        ax2.set_ylabel("nr ratings")
        plt.show()

    X_obf = np.transpose(X_obf)

    # output the data in a file:
    with open("ml-1m/rebalanced_(" + str(low_bound) + "," + str(high_bound) + ").dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write("{}::{}::{}::000000000\n".format(
                        index_user + 1, index_movie + 1, int(rating)))

    return X_obf
コード例 #21
0
ファイル: Obfuscation.py プロジェクト: STrucks/BlurMore
def blurMe_1m():
    """Obfuscate the 1M matrix by adding ratings for gender-indicative movies.

    Steps:
      1. Compute the average non-zero rating of every movie.
      2. Fit a logistic regression on each of 10 stratified folds of the
         first 80% of the users, rank the coefficients per fold and average
         the ranks. The ``top`` movies with the lowest average rank form
         ``L_m``, the ``top`` movies with the highest form ``L_f`` (both hold
         1-based movie ids).
      3. For every user add up to ``p`` times the user's number of ratings:
         users with target 1 receive movies from ``L_m``, users with target
         0 receive movies from ``L_f``. Only unrated movies are filled.
      4. Write the result to
         ``ml-1m/blurme_obfuscated_<p>_<sample_mode>_<rating_mode>.dat`` as
         ``user::movie::rating::000000000`` lines (1-based ids).

    Returns:
        The obfuscated user-item matrix.

    Raises:
        NotImplementedError: if ``rating_mode`` is set to a mode for which
            no rating is implemented (currently 'pred').
    """
    sample_mode = ['random', 'sampled', 'greedy'][2]
    rating_mode = ['highest', 'avg', 'pred'][1]
    if rating_mode not in ('highest', 'avg'):
        # Such a mode would count ratings as added without writing any.
        raise NotImplementedError("rating mode '" + rating_mode + "' is not implemented")
    top = 10
    X = MD.load_user_item_matrix_1m()

    # Average of the non-zero ratings per movie (0 if a movie has none).
    avg_ratings = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        column = X[:, item_id]
        rated = column[column > 0]
        if len(rated) > 0:
            avg_ratings[item_id] = np.average(rated)

    # 1: get the set of most correlated movies, L_f and L_m:
    T = MD.load_gender_vector_1m()
    split = int(0.8 * len(X))
    X_train, T_train = X[:split], T[:split]

    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    for train, _ in cv.split(X_train, T_train):
        random_state = np.random.RandomState(0)
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X_train[train], T_train[train])
        # rank the coefs:
        fold_ranks.append(ss.rankdata(model.coef_[0]))

    mean_ranks = np.average(fold_ranks, axis=0)
    # Pair every mean rank with its 1-based movie id and sort by rank.
    coefs = np.asarray(sorted([rank, movie_id] for movie_id, rank in enumerate(mean_ranks, start=1)))
    L_m = coefs[:top, 1]
    L_f = list(reversed(coefs[coefs.shape[0] - top:, 1]))

    # Now, where we have the two lists, we can start obfuscating the data:
    X_obf = MD.load_user_item_matrix_1m()
    p = 0.05
    total_m, total_f = sum(L_m), sum(L_f)
    prob_m = [movie_id / total_m for movie_id in L_m]
    prob_f = [movie_id / total_f for movie_id in L_f]
    for index, user in enumerate(X):
        k = sum(1 for rating in user if rating > 0) * p
        if T[index] == 1:
            _blurme_add_ratings(X_obf, index, L_m, prob_m, k, sample_mode, rating_mode, avg_ratings)
        elif T[index] == 0:
            _blurme_add_ratings(X_obf, index, L_f, prob_f, k, sample_mode, rating_mode, avg_ratings)

    # output the data in a file:
    with open("ml-1m/blurme_obfuscated_" + str(p) + "_" + sample_mode + "_" + rating_mode + ".dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write("{}::{}::{}::000000000\n".format(
                        index_user + 1, index_movie + 1, int(rating)))
    return X_obf


def _blurme_add_ratings(X_obf, user_index, movie_ids, probabilities, budget,
                        sample_mode, rating_mode, avg_ratings, max_attempts=100):
    """Fill unrated movies from ``movie_ids`` for one user, in place.

    Args:
        X_obf: user-item matrix that is modified in place.
        user_index: row of the user to obfuscate.
        movie_ids: candidate movies as 1-based ids.
        probabilities: sampling weights used by the 'sampled' mode.
        budget: stop once this many ratings have been added.
        sample_mode: 'random', 'sampled' or 'greedy' (candidates in order).
        rating_mode: 'highest' writes 5, 'avg' writes the movie's average.
        avg_ratings: average rating per movie, indexed by 0-based column.
        max_attempts: upper bound on the number of draws.
    """
    added = 0
    attempts = 0
    greedy_index = 0
    while added < budget and attempts < max_attempts:
        if sample_mode == 'random':
            movie_id = movie_ids[np.random.randint(0, len(movie_ids))]
        elif sample_mode == 'sampled':
            movie_id = movie_ids[np.random.choice(range(len(movie_ids)), p=probabilities)]
        elif sample_mode == 'greedy':
            movie_id = movie_ids[greedy_index]
            greedy_index += 1
            if greedy_index >= len(movie_ids):
                # Last candidate: handle it below, then leave the loop.
                attempts = max_attempts
        # Movie ids are 1-based, matrix columns and avg_ratings are 0-based.
        column = int(movie_id) - 1
        if X_obf[user_index, column] == 0:
            if rating_mode == 'highest':
                X_obf[user_index, column] = 5
            elif rating_mode == 'avg':
                X_obf[user_index, column] = avg_ratings[column]
            added += 1
        attempts += 1
コード例 #22
0
def find_good_threshold():
    """Plot how the feature-selection threshold affects AUC and subset size.

    Loads the MovieLens 1M user-item matrix and gender vector, keeps the first
    80% of the users and, for 20 evenly spaced thresholds in [0, 0.001], splits
    the features with ``Utils.random_forest_selection``. ``Classifiers.log_reg``
    is run on both resulting feature subsets; its two return values are treated
    as the mean AUC and its standard deviation.

    Two panels are shown: the mean AUC of both subsets with a band of one
    standard deviation (clipped to [0, 1]), and the number of columns in each
    subset.
    """
    plt.rcParams.update({'font.size': 18})
    import Classifiers
    import Utils
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()

    # Only the first 80% of the users take part in the analysis.
    n_train = int(0.8 * len(X))
    X_train, T_train = X[0:n_train], T[0:n_train]

    precision = 20
    begin, end = 0, 0.001
    # The same x axis is shared by the sweep and by every plot below.
    thresholds = np.linspace(begin, end, precision)

    auc_rel, auc_irrel = [], []
    std_rel, std_irrel = [], []
    size_r, size_i = [], []
    for t in thresholds:
        print(X_train.shape)
        X_train_important, X_train_compl = Utils.random_forest_selection(X_train, T_train, threshold=t)
        print(X_train_important.shape)
        size_r.append(X_train_important.shape[1])
        size_i.append(X_train_compl.shape[1])

        mean_auc_r, std_auc_r = Classifiers.log_reg(X_train_important, T_train, show_plot=False)
        mean_auc_i, std_auc_i = Classifiers.log_reg(X_train_compl, T_train, show_plot=False)
        auc_rel.append(mean_auc_r)
        auc_irrel.append(mean_auc_i)
        std_rel.append(std_auc_r)
        std_irrel.append(std_auc_i)

    # Left panel: mean AUC with a +/- one standard deviation band per subset.
    plt.subplot(1, 2, 1)
    curves = (
        (np.asarray(auc_rel), np.asarray(std_rel), 'b', 'AUC of important features'),
        (np.asarray(auc_irrel), np.asarray(std_irrel), 'r', 'AUC of not important features'),
    )
    for mean_auc, std_auc, colour, label in curves:
        plt.plot(thresholds, mean_auc, c=colour, label=label)
        auc_upper = np.minimum(mean_auc + std_auc, 1)
        auc_lower = np.maximum(mean_auc - std_auc, 0)
        plt.fill_between(thresholds, auc_lower, auc_upper, color='grey', alpha=.2)

    plt.xlabel("Threshold")
    plt.ylabel("Mean AUC")
    plt.legend()

    # Right panel: number of columns that ended up in each subset.
    plt.subplot(1, 2, 2)
    plt.plot(thresholds, size_r, c='b', label='number of important movies')
    plt.plot(thresholds, size_i, c='r', label='number of irrelevant movies')
    plt.xlabel("Threshold")
    plt.ylabel("#samples in data")
    plt.legend()
    plt.show()
コード例 #23
0
def global_stats_ML_obf():
    """Compare rating statistics of the original and two obfuscated matrices.

    Loads the original MovieLens 1M user-item matrix and the masked matrices
    with file index 71 (plotted as "BlurMe") and 55 (plotted as "BlurM(or)e").
    For each matrix the number of ratings per user and the number of
    occurrences of each rating value 1..5 are counted over the index range
    given by the shape of the BlurMe matrix.

    Four figures are shown one after another: the change in rating counts
    relative to the original, the normalised rating distributions, the rating
    counts of the 100 most active users and those of the 300 least active
    users.
    """
    plt.rcParams.update({'font.size': 28})
    original = MD.load_user_item_matrix_1m()
    blurme = MD.load_user_item_matrix_1m_masked(file_index=71)
    blurmore = MD.load_user_item_matrix_1m_masked(file_index=55)
    ratings = [1, 2, 3, 4, 5]
    n_users, n_movies = blurme.shape
    rating_distribution = np.zeros(shape=(3, 5))
    nr_ratings = np.zeros(shape=(3, n_users))

    # Row 0: original, row 1: BlurMe, row 2: BlurM(or)e. Counting is done with
    # array operations instead of visiting every cell in a Python loop.
    for row, matrix in enumerate((original, blurme, blurmore)):
        window = matrix[:n_users, :n_movies]
        nr_ratings[row, :] = np.isin(window, ratings).sum(axis=1)
        rating_distribution[row, :] = [np.count_nonzero(window == r) for r in ratings]

    # Absolute change in the count of every rating value after obfuscation.
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    ax1.bar(range(1, 6), rating_distribution[1, :] - rating_distribution[0, :])
    ax2.bar(range(1, 6), rating_distribution[2, :] - rating_distribution[0, :])
    plt.show()
    print("new ratings:", rating_distribution)
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)

    # Normalise every row so that it sums to 1.
    for row in range(3):
        rating_distribution[row, :] /= rating_distribution[row, :].sum()
    print(rating_distribution)
    # Order the users of every matrix from most to least ratings.
    nr_ratings[:, :] = np.sort(nr_ratings, axis=1)[:, ::-1]
    print(nr_ratings)

    # NOTE: the plotted values are fractions in [0, 1] although the axis label
    # says "%".
    ax1.bar(range(1, 6), rating_distribution[0, :])
    ax1.set_xlabel("Rating")
    ax1.set_ylabel("Rating Frequency in %")
    ax1.set_title("Rating frequency \noriginal ML")
    ax2.bar(range(1, 6), rating_distribution[1, :])
    ax2.set_xlabel("Rating")
    ax2.set_title("Rating frequency \nBlurMe")
    ax3.bar(range(1, 6), rating_distribution[2, :])
    ax3.set_xlabel("Rating")
    ax3.set_title("Rating frequency \nBlurM(or)e")
    plt.show()

    # Rating counts of the 100 users with the most ratings.
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    for axis, row in zip((ax1, ax2, ax3), range(3)):
        axis.bar(range(100), nr_ratings[row, 0:100])
    plt.show()

    # Rating counts of the 300 users with the fewest ratings.
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    for axis, row in zip((ax1, ax2, ax3), range(3)):
        axis.plot(range(300), nr_ratings[row, -300:])
    plt.show()
コード例 #24
0
def blurMe_1m():
    """Obfuscate a user-item matrix with the BlurMe strategy and write it to disk.

    A logistic regression is fitted on 10 stratified folds of the first 80% of
    the users and the items are ordered by the average rank of their
    coefficient. Items at the low end of that ordering form ``L_m``, items at
    the high end form ``L_f`` (most extreme item first in both lists).

    For every user, new ratings are added on unrated items until about ``p``
    times the number of existing ratings has been added or 100 attempts were
    made. Users with ``T == 1`` draw from ``L_m``, users with ``T == 0`` from
    ``L_f``. The result is written as ``user::item::rating::000000000`` lines.

    Sampling mode, rating mode, ``top``, ``p`` and the dataset are hard-coded
    at the top of the function.

    Returns:
        The obfuscated copy of the user-item matrix.
    """
    sample_mode = ['random', 'sampled', 'greedy'][2]
    rating_mode = ['highest', 'avg', 'pred'][1]
    top = -1
    p = 0.01
    dataset = ['ML', 'Fx', 'Li'][0]
    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()
        T = MD.load_gender_vector_1m()
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()

    # Average of the existing (> 0) ratings of every user; 0 for users without
    # ratings. (Averaging per item instead of per user is the alternative.)
    avg_ratings = np.zeros(shape=X.shape[0])
    for index, user in enumerate(X):
        rated = user[user > 0]
        if rated.size > 0:
            avg_ratings[index] = np.average(rated)

    # 1: get the set of most correlated movies, L_f and L_m:
    n_train = int(0.8 * len(X))
    X_train, T_train = X[0:n_train], T[0:n_train]
    print("lists")
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    # Sum (not mean) of the coefficients over the folds; only its sign and
    # relative magnitude are used below.
    avg_coefs = np.zeros(shape=(X_train.shape[1],))

    random_state = np.random.RandomState(0)
    for train, _ in cv.split(X_train, T_train):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X_train[train], T_train[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        avg_coefs += model.coef_[0]

    mean_ranks = np.average(fold_ranks, axis=0)
    # Rows are [average rank, 1-based item id, summed coefficient], ordered by
    # ascending rank (ties broken by item id).
    coefs = np.asarray(sorted(
        [rank, item + 1, coef] for item, (rank, coef) in enumerate(zip(mean_ranks, avg_coefs))))
    if top == -1:
        # Cut the lists where the coefficients are closest to zero. The
        # comparison is done on absolute values so that a negative coefficient
        # closest to zero is found as well.
        magnitudes = np.abs(coefs[:, 2])
        index_zero = np.where(magnitudes == np.min(magnitudes))
        top_male = index_zero[0][0]
        top_female = index_zero[0][-1]
    else:
        top_male = top_female = top
    L_m = coefs[:top_male, 1]
    C_m = np.abs(coefs[:top_male, 2])
    # NOTE(review): this keeps the last `top_female` rows, i.e. an index is
    # used as a count; confirm that coefs[top_female + 1:] was not intended.
    L_f = list(reversed(coefs[coefs.shape[0] - top_female:, 1]))
    C_f = list(reversed(np.abs(coefs[coefs.shape[0] - top_female:, 2])))

    # Now, where we have the two lists, we can start obfuscating the data:
    X_obf = np.copy(X)

    # Selection probabilities proportional to the coefficient magnitude; only
    # needed (and only valid) for the 'sampled' mode.
    prob_m, prob_f = [], []
    if sample_mode == 'sampled':
        prob_m = np.asarray(C_m, dtype=float) / np.sum(C_m)
        prob_f = np.asarray(C_f, dtype=float) / np.sum(C_f)

    print("obfuscation")
    for index, user in enumerate(X):
        print(index)
        # Number of ratings to add for this user.
        k = np.count_nonzero(user > 0) * p
        if T[index] == 1:
            candidates, probs = L_m, prob_m
        elif T[index] == 0:
            candidates, probs = L_f, prob_f
        else:
            continue
        if len(candidates) == 0:
            # Nothing to draw from.
            continue

        greedy_index = 0
        added = 0
        safety_counter = 0
        while added < k and safety_counter < 100:
            if sample_mode == 'random':
                movie_id = candidates[np.random.randint(0, len(candidates))]
            elif sample_mode == 'sampled':
                movie_id = candidates[np.random.choice(range(len(candidates)), p=probs)]
            elif sample_mode == 'greedy':
                movie_id = candidates[greedy_index]
                greedy_index += 1
                if greedy_index >= len(candidates):
                    # The list is exhausted: make this the last iteration.
                    safety_counter = 100
            column = int(movie_id) - 1
            if X_obf[index, column] == 0:
                if rating_mode == 'highest':
                    X_obf[index, column] = 5
                elif rating_mode == 'avg':
                    X_obf[index, column] = avg_ratings[index]
                # NOTE: 'pred' is not implemented; as before, the slot is
                # counted although no rating is written.
                added += 1
            safety_counter += 1

    # output the data in a file:
    suffix = str(p) + "_" + sample_mode + "_" + rating_mode + "_top" + str(top) + ".dat"
    user_index2id = movie_index2id = None
    if dataset == 'ML':
        path = "ml-1m/" + "blurme_obfuscated_" + suffix
    elif dataset == 'Fx':
        import FlixsterData as FD
        _, user_index2id = FD.load_user_id_index_dict()
        _, movie_index2id = FD.load_movie_id_index_dict()
        path = "Flixster/" + "FX_blurme_obfuscated_" + suffix
    else:
        path = "libimseti/LST_blurme_obfuscated_" + suffix

    with open(path, 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    # Flixster uses its own ids, the other datasets 1-based indices.
                    if user_index2id is None:
                        user_id, movie_id = index_user + 1, index_movie + 1
                    else:
                        user_id, movie_id = user_index2id[index_user], movie_index2id[index_movie]
                    f.write(str(user_id) + "::" + str(movie_id) + "::" + str(
                        int(np.round(rating))) + "::000000000\n")

    return X_obf
コード例 #25
0
def blurMePP():
    """Obfuscate a user-item matrix with the BlurMe++ strategy and write it to disk.

    A logistic regression is fitted on 10 stratified folds of all users and
    the items are ordered by the average rank of their coefficient; the low
    end forms ``L_m`` and the high end ``L_f`` (most extreme item first).

    For every user about ``p`` times the number of existing ratings is added
    on unrated items, using the item's average rating. Users with ``T == 1``
    draw from ``L_m``, users with ``T == 0`` from ``L_f``. An item is skipped
    once its rating count exceeds ``notice_factor`` times its original count.
    Afterwards the same number of original ratings is removed at random from
    every user with more than 200 ratings, so that about as many ratings are
    removed in total as were added.

    Sampling mode, ``top``, ``p``, ``notice_factor`` and the dataset are
    hard-coded at the top of the function.

    Returns:
        The obfuscated copy of the user-item matrix.

    Raises:
        ValueError: If the configured sample mode is not 'greedy' or 'random'.
    """
    top = -1
    sample_mode = ['random', 'sampled', 'greedy'][2]
    notice_factor = 2
    p = 0.1
    dataset = ['ML', 'Fx', 'Li'][2]
    if sample_mode not in ('greedy', 'random'):
        raise ValueError("unsupported sample mode: " + sample_mode)
    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()
        T = MD.load_gender_vector_1m()
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()

    # Average rating and number of ratings of every item (0 for unrated items).
    avg_ratings = np.zeros(shape=X.shape[1])
    initial_count = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        column = X[:, item_id]
        rated = column[column > 0]
        if rated.size > 0:
            avg_ratings[item_id] = np.average(rated)
        initial_count[item_id] = rated.size
    max_count = initial_count * notice_factor

    # 1: get the set of most correlated movies, L_f and L_m:
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    # Sum (not mean) of the coefficients over the folds.
    avg_coefs = np.zeros(shape=(X.shape[1],))

    random_state = np.random.RandomState(0)
    for train, _ in cv.split(X, T):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X[train], T[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        avg_coefs += model.coef_[0]

    mean_ranks = np.average(fold_ranks, axis=0)
    # Rows are [average rank, 1-based item id, summed coefficient], ordered by
    # ascending rank (ties broken by item id).
    coefs = np.asarray(sorted(
        [rank, item + 1, coef] for item, (rank, coef) in enumerate(zip(mean_ranks, avg_coefs))))
    if top == -1:
        # Cut the lists where the coefficients are closest to zero; compare
        # absolute values so a negative closest-to-zero coefficient is found.
        magnitudes = np.abs(coefs[:, 2])
        index_zero = np.where(magnitudes == np.min(magnitudes))
        top_male = index_zero[0][0]
        top_female = index_zero[0][-1]
    else:
        top_male = top_female = top
    L_m = coefs[:top_male, 1]
    # NOTE(review): this keeps the last `top_female` rows, i.e. an index is
    # used as a count; confirm that coefs[top_female + 1:] was not intended.
    L_f = list(reversed(coefs[coefs.shape[0] - top_female:, 1]))

    # Now, where we have the two lists, we can start obfuscating the data:
    X_obf = np.copy(X)
    total_added = 0
    for index, user in enumerate(X):
        print(index)
        # Number of ratings to add for this user.
        k = np.count_nonzero(user > 0) * p
        if T[index] == 1:
            candidates = L_m
        elif T[index] == 0:
            candidates = L_f
        else:
            candidates = []

        added = 0
        attempts = 0
        safety_counter = 0
        # `attempts` bounds the loop by the list length in both sample modes.
        while added < k and safety_counter < 1000 and attempts < len(candidates):
            if sample_mode == 'greedy':
                movie_id = candidates[attempts]
            else:
                movie_id = candidates[np.random.randint(0, len(candidates))]
            attempts += 1
            column = int(movie_id) - 1
            # Do not make an item noticeably more popular than it was.
            if np.count_nonzero(X_obf[:, column] > 0) > max_count[column]:
                continue
            if X_obf[index, column] == 0:
                X_obf[index, column] = avg_ratings[column]
                added += 1
            safety_counter += 1
        total_added += added

    # Now remove ratings from users that have more than 200 ratings equally:
    heavy_users = [user_index for user_index, user in enumerate(X)
                   if np.count_nonzero(user > 0) > 200]
    nr_many_ratings = len(heavy_users)
    print(nr_many_ratings)
    if nr_many_ratings > 0:
        nr_remove = int(total_added / nr_many_ratings)
        for user_index in heavy_users:
            # Only ratings that exist in the original data are candidates.
            rated = np.argwhere(X[user_index] > 0)[:, 0]
            # Never ask for more items than the user has rated.
            to_be_removed_indecies = np.random.choice(
                rated, size=(min(nr_remove, len(rated)),), replace=False)
            X_obf[user_index, to_be_removed_indecies] = 0

    # output the data in a file:
    suffix = sample_mode + "_" + str(p) + "_" + str(notice_factor) + ".dat"
    user_index2id = movie_index2id = None
    if dataset == 'ML':
        path = "ml-1m/" + "blurmepp_obfuscated_" + suffix
    elif dataset == 'Fx':
        import FlixsterData as FD
        _, user_index2id = FD.load_user_id_index_dict()
        _, movie_index2id = FD.load_movie_id_index_dict()
        path = "Flixster/" + "FX_blurmepp_obfuscated_" + suffix
    else:
        path = "libimseti/LST_blurmepp_obfuscated_" + suffix

    with open(path, 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    # Flixster uses its own ids, the other datasets 1-based indices.
                    if user_index2id is None:
                        user_id, movie_id = index_user + 1, index_movie + 1
                    else:
                        user_id, movie_id = user_index2id[index_user], movie_index2id[index_movie]
                    f.write(str(user_id) + "::" + str(movie_id) + "::" + str(
                        int(np.round(rating))) + "::000000000\n")

    return X_obf
コード例 #26
0
def blurMeBetter():
    """Obfuscate only confidently classified users and write the result to disk.

    A logistic regression is fitted on 10 stratified folds of all users. The
    items are ordered by the average rank of their coefficient; the low end
    forms ``L_m`` and the high end ``L_f`` (most extreme item first). Every
    user also gets a certainty score from the fold in which the user was held
    out: the winning class probability rescaled from [0.5, 1] to [0, 1], or 0
    when the prediction was wrong.

    Users with a certainty below ``certainty_threshold`` are left untouched.
    For the others about ``p`` times the number of existing ratings is added
    on unrated items, using the item's average rating. Users with ``T == 1``
    draw from ``L_m``, users with ``T == 0`` from ``L_f``. An item is skipped
    once its rating count exceeds ``notice_factor`` times its original count.
    Afterwards the same number of original ratings is removed at random from
    every user with more than 200 ratings, so that about as many ratings are
    removed in total as were added.

    All settings are hard-coded at the top of the function.

    Returns:
        The obfuscated copy of the user-item matrix.

    Raises:
        ValueError: If the configured sample mode is not 'greedy' or 'random'.
    """
    top = -1
    sample_mode = ['random', 'sampled', 'greedy'][2]
    p = 0.05
    notice_factor = 2
    certainty_threshold = 0.8
    dataset = ['ML', 'Fx', 'Li'][0]
    if sample_mode not in ('greedy', 'random'):
        raise ValueError("unsupported sample mode: " + sample_mode)
    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()
        T = MD.load_gender_vector_1m()
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()

    # Average rating and number of ratings of every item (0 for unrated items).
    avg_ratings = np.zeros(shape=X.shape[1])
    initial_count = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        column = X[:, item_id]
        rated = column[column > 0]
        if rated.size > 0:
            avg_ratings[item_id] = np.average(rated)
        initial_count[item_id] = rated.size
    max_count = initial_count * notice_factor

    # 1: get the set of most correlated movies, L_f and L_m:
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    # Sum (not mean) of the coefficients over the folds.
    avg_coefs = np.zeros(shape=(X.shape[1],))

    certainty = np.zeros(shape=(len(X),))
    random_state = np.random.RandomState(0)
    for train, test in cv.split(X, T):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X[train], T[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        avg_coefs += model.coef_[0]

        x_test = X[test]
        # Rescale so that 1 means the classifier is very sure and 0 means it
        # is not sure at all.
        certainty[test] = (np.max(model.predict_proba(x_test), axis=1) - 0.5) * 2
        # set certainty to 0 for all misclassifications:
        certainty[test[model.predict(x_test) != T[test]]] = 0

    mean_ranks = np.average(fold_ranks, axis=0)
    # Rows are [average rank, 1-based item id, summed coefficient], ordered by
    # ascending rank (ties broken by item id).
    coefs = np.asarray(sorted(
        [rank, item + 1, coef] for item, (rank, coef) in enumerate(zip(mean_ranks, avg_coefs))))
    if top == -1:
        # Cut the lists where the coefficients are closest to zero; compare
        # absolute values so a negative closest-to-zero coefficient is found.
        magnitudes = np.abs(coefs[:, 2])
        index_zero = np.where(magnitudes == np.min(magnitudes))
        top_male = index_zero[0][0]
        top_female = index_zero[0][-1]
    else:
        top_male = top_female = top
    L_m = coefs[:top_male, 1]
    # NOTE(review): this keeps the last `top_female` rows, i.e. an index is
    # used as a count; confirm that coefs[top_female + 1:] was not intended.
    L_f = list(reversed(coefs[coefs.shape[0] - top_female:, 1]))

    # Now, where we have the two lists, we can start obfuscating the data:
    X_obf = np.copy(X)
    total_added = 0
    nr_skipped_users = 0
    for index, user in enumerate(X):
        if certainty[index] < certainty_threshold:
            nr_skipped_users += 1
            print(index, nr_skipped_users)
            continue
        # Number of ratings to add for this user.
        k = np.count_nonzero(user > 0) * p
        if T[index] == 1:
            candidates = L_m
        elif T[index] == 0:
            candidates = L_f
        else:
            candidates = []

        added = 0
        attempts = 0
        safety_counter = 0
        # `attempts` bounds the loop by the list length in both sample modes.
        while added < k and safety_counter < 1000 and attempts < len(candidates):
            if sample_mode == 'greedy':
                movie_id = candidates[attempts]
            else:
                movie_id = candidates[np.random.randint(0, len(candidates))]
            attempts += 1
            column = int(movie_id) - 1
            # Do not make an item noticeably more popular than it was.
            if np.count_nonzero(X_obf[:, column] > 0) > max_count[column]:
                continue
            if X_obf[index, column] == 0:
                X_obf[index, column] = avg_ratings[column]
                added += 1
            safety_counter += 1
        total_added += added
    print("nr of skipped users:", nr_skipped_users)

    # Now remove ratings from users that have more than 200 ratings equally:
    heavy_users = [user_index for user_index, user in enumerate(X)
                   if np.count_nonzero(user > 0) > 200]
    nr_many_ratings = len(heavy_users)
    if nr_many_ratings > 0:
        nr_remove = int(total_added / nr_many_ratings)
        for user_index in heavy_users:
            # Only ratings that exist in the original data are candidates.
            rated = np.argwhere(X[user_index] > 0)[:, 0]
            # Never ask for more items than the user has rated.
            to_be_removed_indecies = np.random.choice(
                rated, size=(min(nr_remove, len(rated)),), replace=False)
            X_obf[user_index, to_be_removed_indecies] = 0

    # output the data in a file:
    suffix = sample_mode + "_" + str(p) + "_" + str(notice_factor) + "_c" + str(certainty_threshold) + ".dat"
    user_index2id = movie_index2id = None
    if dataset == 'ML':
        path = "ml-1m/" + "blurmebetter_obfuscated_" + suffix
    elif dataset == 'Fx':
        import FlixsterData as FD
        _, user_index2id = FD.load_user_id_index_dict()
        _, movie_index2id = FD.load_movie_id_index_dict()
        path = "Flixster/" + "FX_blurmebetter_obfuscated_" + suffix
    else:
        path = "libimseti/LST_blurmebetter_obfuscated_" + suffix

    with open(path, 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    # Flixster uses its own ids, the other datasets 1-based indices.
                    if user_index2id is None:
                        user_id, movie_id = index_user + 1, index_movie + 1
                    else:
                        user_id, movie_id = user_index2id[index_user], movie_index2id[index_movie]
                    f.write(str(user_id) + "::" + str(movie_id) + "::" + str(
                        int(np.round(rating))) + "::000000000\n")

    return X_obf