def rating_add_1m():
    # add a percentage of random ratings to a user:
    X = MD.load_user_item_matrix_1m()
    X_obf = MD.load_user_item_matrix_1m()
    percentage = 0.05
    for user_index, user in enumerate(X):
        nr_ratings = 0
        for rating in user:
            if rating > 0:
                nr_ratings += 1
        added = 0
        safety_counter = 0
        while added < nr_ratings * percentage and safety_counter < 100:
            index = np.random.randint(0, len(user))
            if X_obf[user_index, index] > 0:
                safety_counter += 1
                continue
            else:
                X_obf[user_index, index] = np.random.randint(1, 6)
                added += 1  # count the new rating; without this the loop never terminates
    # output the data in a file:
    with open("ml-1m/random_added_obfuscated_" + str(percentage) + ".dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                            + str(int(rating)) + "::000000000\n")
    return X_obf
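# Self-contained toy version of the add-with-safety-counter loop above, to make
# the termination logic explicit. All names and sizes here are illustrative only,
# not part of the module.
def _demo_random_add(percentage=0.05):
    import numpy as np
    rng = np.random.RandomState(0)
    user = rng.choice([0, 0, 0, 4, 5], size=40)   # sparse toy rating vector
    nr_ratings = np.sum(user > 0)
    added, safety_counter = 0, 0
    while added < nr_ratings * percentage and safety_counter < 100:
        index = rng.randint(0, len(user))
        if user[index] > 0:
            safety_counter += 1                   # hit an existing rating, try again
            continue
        user[index] = rng.randint(1, 6)           # new rating in 1..5
        added += 1
    return user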
def comp_BM_and_BMpp():
    plt.rcParams.update({'font.size': 28})
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    interval_start, interval_end = 0, 50
    X = MD.load_user_item_matrix_1m()
    X_movie_count = [sum([1 if x > 0 else 0 for x in movie])
                     for movie in np.transpose(X)[interval_start:interval_end]]
    movie_count_o = np.asarray(X_movie_count)
    ax1.bar(range(interval_start, interval_end), X_movie_count)
    ax1.set_title("(A)\nOriginal data")
    ax1.set_xlabel("movie ID")
    ax1.set_ylabel("#ratings")
    #ax1.set_xticks(range(1,6), [1,2,3,4,5])
    print("Original Data:", sum(X_movie_count))
    X = MD.load_user_item_matrix_1m_masked(file_index=63)
    X_movie_count = [sum([1 if x > 0 else 0 for x in movie])
                     for movie in np.transpose(X)[interval_start:interval_end]]
    masked_count = np.asarray(X_movie_count)
    ax2.bar(range(interval_start, interval_end), X_movie_count)
    ax2.set_title("(B)\nBlurMe data")
    ax2.set_xlabel("movie ID")
    ax2.set_ylabel("#ratings")
    print("BlurMe Data:", sum(X_movie_count))
    X = MD.load_user_item_matrix_1m_masked(file_index=75)
    X_movie_count = [sum([1 if x > 0 else 0 for x in movie])
                     for movie in np.transpose(X)[interval_start:interval_end]]
    masked_count2 = np.asarray(X_movie_count)
    ax3.bar(range(interval_start, interval_end), X_movie_count)
    ax3.set_title("(C)\nBlurM(or)e data")
    ax3.set_xlabel("movie ID")
    ax3.set_ylabel("#ratings")
    print("BlurM(or)e Data:", sum(X_movie_count))
    print(movie_count_o - masked_count)
    print(masked_count - masked_count2)
    plt.show()
def load_real_fake_data_ML_1m(file_index=24):
    real = MD.load_user_item_matrix_1m()
    #real = MD.load_user_item_matrix_100k()
    #real = load_user_item_matrix_1m_masked(file_index=17)
    real = real[0:int(real.shape[0] / 2), :]
    #fake = load_user_item_matrix_100k()
    #fake = simulate_data(real.shape)
    fake = load_user_item_matrix_1m_masked(file_index=file_index)
    #fake = MD.load_user_item_matrix_100k_masked(file_index=1)
    fake = fake[int(fake.shape[0] / 2):, :]
    #fake = real
    #fake = np.random.randint(5, size=real.shape)
    #print(fake)
    data = np.zeros(shape=(real.shape[0] + fake.shape[0], real[0].shape[0]))
    labels = np.zeros(shape=(real.shape[0] + fake.shape[0],))
    for user_index, user in enumerate(real):
        data[user_index, :] = user
        labels[user_index] = 1
    for user_index, user in enumerate(fake):
        data[len(real) + user_index, :] = user
        labels[len(real) + user_index] = 0
    from Utils import shuffle_two_arrays
    data, labels = shuffle_two_arrays(data, labels)
    return data, labels
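# Self-contained illustration (toy data, hypothetical shapes) of the real/fake
# stacking and labelling scheme used in load_real_fake_data_ML_1m: real users
# get label 1, fake users label 0, then both are shuffled with a shared
# permutation so data[i] still matches labels[i].
def _demo_real_fake_stacking():
    import numpy as np
    rng = np.random.RandomState(0)
    real = rng.randint(0, 6, size=(4, 5))   # stands in for the real half
    fake = rng.randint(0, 6, size=(4, 5))   # stands in for the obfuscated half
    data = np.vstack([real, fake])
    labels = np.concatenate([np.ones(len(real)), np.zeros(len(fake))])
    perm = rng.permutation(len(data))       # same idea as Utils.shuffle_two_arrays
    return data[perm], labels[perm]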
def one_million(classifier):
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    T = MD.load_user_genre_matrix_1m(one_hot=True, top=1)
    T = np.argwhere(T == 1)[:, 1]
    print(min(T), max(T))
    """
    Note that we lose class 13 (Romance): it seems that no one has Romance as
    their favourite genre. This makes sense because it correlates strongly with
    Drama and Comedy.
    """
    import collections
    import matplotlib.pyplot as plt
    counter = collections.Counter(T)
    #plt.bar(counter.keys(), counter.values())
    #plt.xlabel("T")
    #plt.ylabel('frequency')
    #plt.show()
    print(counter)
    X = Utils.normalize(X)
    #print(T)
    #X = MD.feature_selection(X, T, f_regression)
    #X = MD.chi2_selection(X, T)
    classifier(X, T, multiclass=True, nr_classes=17)
def compare_real_fake():
    import RealFakeData as RFD
    real = MD.load_user_item_matrix_1m()
    real = real[0:40, 0:40]
    # fake = load_user_item_matrix_100k()
    # fake = simulate_data(real.shape)
    fake_bm = RFD.load_user_item_matrix_1m_masked(file_index=12)
    fake_bm = fake_bm[0:40, 0:40]
    fake_bmpp = RFD.load_user_item_matrix_1m_masked(file_index=17)
    fake_bmpp = fake_bmpp[0:40, 0:40]
    print(fake_bmpp.shape)
    plt.subplot(3, 3, 1)
    plt.imshow(real)
    plt.title("real")
    plt.subplot(3, 3, 4)
    plt.imshow(fake_bm)
    plt.title("fake_bm")
    plt.subplot(3, 3, 7)
    plt.imshow(fake_bmpp)
    plt.title("fake_bmpp")
    plt.subplot(3, 3, 5)
    plt.imshow(real - fake_bm)
    plt.title("real-fake_bm")
    plt.subplot(3, 3, 8)
    plt.imshow(real - fake_bmpp)
    plt.title("real-fake_bmpp")
    plt.show()
def one_million_obfuscated(classifier):
    #X2 = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    T = MD.load_gender_vector_1m()  # max_user=max_user)
    X1 = MD.load_user_item_matrix_1m()
    X2 = MD.load_user_item_matrix_1m_masked(file_index=55)  # max_user=max_user, max_item=max_item)
    #X2 = X1
    print(X1.shape, X2.shape)
    #X1, T = Utils.balance_data(X1, T)
    #X2, T2 = Utils.balance_data(X2, T)
    #X1 = Utils.normalize(X1)
    #X2 = Utils.normalize(X2)
    X_train, T_train = X1[0:int(0.8 * len(X1))], T[0:int(0.8 * len(X1))]
    X_test, T_test = X2[int(0.8 * len(X2)):], T[int(0.8 * len(X2)):]
    print(list(X1[0, :]))
    print(list(X2[0, :]))
    # print(X)
    print("before", X_train.shape)
    # X = Utils.remove_significant_features(X, T)
    # X_train, _ = Utils.random_forest_selection(X_train, T_train)
    # X = feature_selection(X, T, Utils.select_male_female_different)
    print(X_train.shape)
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    Utils.ROC_cv_obf(X1, X2, T, model)
    model = LogisticRegression(penalty='l2', random_state=random_state)
def one_million(classifier):
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    #X = MD.load_user_item_matrix_1m_limited_ratings(limit=1)
    #X = MD.load_user_item_matrix_1m_binary()
    # X = MD.load_user_genre_matrix_100k_obfuscated()
    T = MD.load_gender_vector_1m()  # max_user=max_user)
    #X, T = Utils.balance_data(X, T)
    #X = Utils.normalize(X)
    X = feature_selection(X, T, Utils.select_male_female_different)
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    # print(X)
    print("before", X_train.shape)
    # X = Utils.remove_significant_features(X, T)
    #X_train, _ = Utils.random_forest_selection(X_train, T_train)
    # X = feature_selection(X, T, Utils.select_male_female_different)
    print(X_train.shape)
    # X = Utils.normalize(X)
    # X = Utils.standardize(X)
    # X = chi2_selection(X, T)
    classifier(X_train, T_train)
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    #model = Models.Dominant_Class_Classifier()
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
def feature_importance_1m():
    plt.rcParams.update({'font.size': 18})
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    import seaborn as sns
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    importance = np.zeros(shape=(X.shape[1],))
    importance2 = np.zeros(shape=(X.shape[1],))
    for i in range(10):
        model = LogisticRegression()
        model2 = RandomForestClassifier()
        model.fit(X, T)
        model2.fit(X, T)
        importance += model.coef_[0]
        importance2 += model2.feature_importances_
    importance /= 10
    importance2 /= 10
    #plt.bar(range(1,len(importance[0:30])+1), importance[0:30])
    #plt.xlabel("movie index")
    #plt.ylabel("importance")
    #plt.show()
    #sns.distplot(importance, kde=False)
    plt.hist(importance2, bins=np.linspace(0, 0.001, 50))
    #sns.kdeplot(importance,shade=True,cut=0)
    #sns.rugplot(importance)
    plt.xlabel("importance")
    plt.ylabel("frequency")
    plt.title("Importance of movies distribution")
    plt.show()
    importance_id = zip(importance, range(1, len(importance) + 1))
    importance_id = list(reversed(sorted(importance_id)))
    importance_id2 = zip(importance, range(1, len(importance) + 1))
    importance_id2 = list(sorted(importance_id2))
    importance_id3 = zip(importance2, range(1, len(importance2) + 1))
    importance_id3 = list(reversed(sorted(importance_id3)))
    set1 = set()
    set2 = set()
    set3 = set()
    names = MD.load_movie_id_dictionary_1m()
    top = 100
    for (_, id), (_, id2), (_, id3) in zip(importance_id[0:top], importance_id2[0:top], importance_id3[0:top]):
        print(names[id], "|", names[id2], "|", names[id3])
        set1.add(names[id])
        set2.add(names[id2])
        set3.add(names[id3])
    #print(set3)
    print(set3.intersection(set2.union(set1)))
    #print(importance_id)
def rating_distr():
    T = MD.load_gender_vector_1m()
    X = MD.load_user_item_matrix_1m()
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    frequencies = np.zeros(shape=(6,))
    ratings = []
    movie_ids = []
    for user in X:
        for index, rating in enumerate(user):
            frequencies[int(rating)] += 1
            if rating > 0:
                movie_ids.append(index + 1)
                ratings.append(rating)
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies),
          "without 0:", np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    print(len(set(movie_ids)))
    ax1.bar(range(5), frequencies[1:])
    ax1.set_xlabel("Original")
    X = MD.load_user_item_matrix_1m_masked(file_index=71)  # greedy 10%
    frequencies = np.zeros(shape=(6,))
    ratings = []
    for user in X:
        for rating in user:
            frequencies[int(rating)] += 1
            if rating > 0:
                ratings.append(rating)
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies),
          "without 0:", np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    ax2.bar(range(5), frequencies[1:])
    ax2.set_xlabel("BlurMe")
    X = MD.load_user_item_matrix_1m_masked(file_index=55)  # BlurMe++ 10%, fac=2
    frequencies = np.zeros(shape=(6,))
    ratings = []
    for user in X:
        for rating in user:
            frequencies[int(rating)] += 1
            if rating > 0:
                ratings.append(rating)
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies),
          "without 0:", np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    ax3.bar(range(5), frequencies[1:])
    ax3.set_xlabel("BlurMe++")
    plt.show()
def feature_importance_1m():
    from sklearn.ensemble import RandomForestClassifier
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    importance = np.zeros(shape=(X.shape[1],))
    for i in range(10):
        model = RandomForestClassifier()
        model.fit(X, T)
        importance += model.feature_importances_
    importance /= 10
    plt.bar(range(1, len(importance[0:30]) + 1), importance[0:30])
    plt.xlabel("movie index")
    plt.ylabel("importance")
    plt.show()
    counter = 0
    for movie, score in enumerate(importance):
        if score >= 0.002:
            print(movie + 1, end=",")
            counter += 1
    print()
    print(counter)
    nr_ratings = np.zeros(shape=(X.shape[1],))
    for index, movie in enumerate(np.transpose(X)):
        counter = 0
        for rating in movie:
            if rating > 0:
                counter += 1
        nr_ratings[index] = counter
    avg_nr_per_importance = {}
    nr_ratings_importance = []
    for nr, imp in zip(nr_ratings, importance):
        if imp in avg_nr_per_importance:
            avg_nr_per_importance[imp].append(nr)
        else:
            avg_nr_per_importance[imp] = [nr]
        nr_ratings_importance.append([nr, imp])
    #for key in avg_nr_per_importance.keys():
    #    avg_nr_per_importance[key] = np.average(avg_nr_per_importance[key])
    #print(avg_nr_per_importance)
    plt.subplot(1, 2, 1)
    for nr, imp in nr_ratings_importance:
        plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")
    plt.subplot(1, 2, 2)
    for nr, imp in nr_ratings_importance:
        if nr < 100:
            plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")
    plt.show()
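# Minimal sketch of the feature-importance averaging above, on synthetic data:
# the importances of several RandomForest fits are averaged because each forest
# is randomised. Toy shapes only; nothing here depends on MovieLens data.
def _demo_rf_importance(n_runs=10):
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    rng = np.random.RandomState(0)
    X = rng.randint(0, 6, size=(80, 10))
    T = rng.randint(0, 2, size=(80,))
    importance = np.zeros(X.shape[1])
    for _ in range(n_runs):
        importance += RandomForestClassifier().fit(X, T).feature_importances_
    return importance / n_runs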
def one_million(classifier):
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    T = MD.load_occupation_vector_1m()  # max_user=max_user)
    X = Utils.normalize(X)
    #print(T)
    #X = MD.feature_selection(X, T, f_regression)
    #X = MD.chi2_selection(X, T)
    classifier(X, T, multiclass=True)
def one_million(classifier):
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    T = MD.load_age_vector_1m(border=30)  # max_user=max_user)
    #X = normalize(X)
    print(min(X[:, 0]), np.mean(X[:, 0]))
    #X = feature_selection(X, T, f_regression)
    #X = chi2_selection(X, T)
    classifier(X, T)
def movie_ratings():
    X = MD.load_user_item_matrix_1m()
    freqs = []
    for movie in np.transpose(X):
        freq = 0
        for rating in movie:
            if rating > 0:
                freq += 1
        freqs.append(freq)
    print(list(sorted(freqs)))
    print(set(freqs))
def lot_ratings():
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    test_data = list(zip(X_test, T_test))
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    # Utils.ROC_plot(X_test, T_test, model)
    roc = True
    min_rating = [162, 211, 282, 390]
    for index, max_rating in enumerate([210, 281, 389, 1000]):
        selected_X = []
        selected_T = []
        for user, label in test_data:
            counter = 0
            for rating in user:
                if rating > 0:
                    counter += 1
            if min_rating[index] <= counter <= max_rating:
                selected_X.append(user)
                selected_T.append(label)
        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)
        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            plt.title('Receiver Operating Characteristic with users having rated between '
                      + str(min_rating[index]) + " and " + str(max_rating)
                      + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For max rating =", max_rating, ":")
        Y = model.predict(selected_X)
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)
    if roc:
        plt.show()
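# Minimal sketch of the ROC/AUC computation used above, on toy scores, to make
# the fpr/tpr/threshold outputs concrete (sklearn.metrics only; no MovieLens
# data needed):
def _demo_roc_auc():
    import sklearn.metrics as metrics
    t = [0, 0, 1, 1]
    scores = [0.1, 0.4, 0.35, 0.8]          # predicted P(class=1)
    fpr, tpr, thresholds = metrics.roc_curve(t, scores)
    return metrics.auc(fpr, tpr)            # 0.75 for this toy example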
def rating_exploration():
    X = MD.load_user_item_matrix_1m()
    rating_distr = {}
    for index, user in enumerate(X):
        nr_ratings = 0
        for rating in user:
            if rating > 0:
                nr_ratings += 1
        if nr_ratings == 0:
            print(index, user)
        if nr_ratings > 200:
            continue
        if nr_ratings in rating_distr:
            rating_distr[nr_ratings] += 1
        else:
            rating_distr[nr_ratings] = 1
    print(rating_distr)
    plt.rcParams.update({'font.size': 22})
    plt.bar(rating_distr.keys(), rating_distr.values())
    plt.xlabel("#ratings per user")
    plt.ylabel("frequency")
    plt.show()
    rating_distr = {}
    for index, user in enumerate(X):
        for rating in user:
            if rating not in rating_distr:
                rating_distr[rating] = 1
            else:
                rating_distr[rating] += 1
    print(rating_distr)
    print(X.shape[0] * X.shape[1])
    plt.bar(rating_distr.keys(), rating_distr.values())
    plt.show()
    rating_distr = {}
    X = np.transpose(X)
    for index, item in enumerate(X):
        nr_ratings = 0
        for rating in item:
            if rating > 0:
                nr_ratings += 1
        if nr_ratings == 0:
            print(index, item)
            continue
        if nr_ratings in rating_distr:
            rating_distr[nr_ratings] += 1
        else:
            rating_distr[nr_ratings] = 1
    print(rating_distr)
    plt.bar(rating_distr.keys(), rating_distr.values())
    plt.show()
def show_avg_rating_gender_per_movie(movie_id=1):
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()
    ratings = user_item[:, movie_id]  # note: movie_id is used as a 0-based column index here
    male_ratings = []
    female_ratings = []
    for user_id, rating in enumerate(ratings):
        if rating > 0:
            if gender_dict[user_id] == 'M':
                male_ratings.append(rating)
            else:
                female_ratings.append(rating)
    plt.bar(["male", "female"], [np.average(male_ratings), np.average(female_ratings)])
    plt.show()
def loyal_ratings():
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    test_data = list(zip(X_test, T_test))
    test_indices = range(int(0.8 * len(X)), len(X))
    print(len(X_test))
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    # Utils.ROC_plot(X_test, T_test, model)
    roc = True
    upper_bound = [0.17, 0.19, 0.42, 1]
    for index, percent_loyal in enumerate([0.0, 0.17, 0.35, 0.42]):
        test_ids = [i + 1 for i in test_indices]
        selected_ids = Utils.is_loyal(test_ids, loyal_percent_lower=percent_loyal,
                                      loyal_percent_upper=upper_bound[index])
        selected_indices = [i - 1 for i in selected_ids]
        selected_X = X[selected_indices]
        selected_T = T[selected_indices]
        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)
        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            plt.title('Receiver Operating Characteristic with users having a loyalty between '
                      + str(percent_loyal) + ' and ' + str(upper_bound[index])
                      + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For loyalty =", percent_loyal, ":")
        Y = model.predict(selected_X)
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)
    if roc:
        plt.show()
def rating_exploration_100k():
    # NOTE: despite the "_100k" name, this loads the 1M matrix
    X = MD.load_user_item_matrix_1m()
    rating_distr = {}
    for index, user in enumerate(X):
        nr_ratings = 0
        for rating in user:
            if rating > 0:
                nr_ratings += 1
        if nr_ratings == 0:
            print(index, user)
        if nr_ratings in rating_distr:
            rating_distr[nr_ratings] += 1
        else:
            rating_distr[nr_ratings] = 1
    print(rating_distr)
    plt.bar(rating_distr.keys(), rating_distr.values())
    plt.show()
    rating_distr = {}
    for index, user in enumerate(X):
        for rating in user:
            if rating not in rating_distr:
                rating_distr[rating] = 1
            else:
                rating_distr[rating] += 1
    print(rating_distr)
    print(X.shape[0] * X.shape[1])
    plt.bar(rating_distr.keys(), rating_distr.values())
    plt.show()
    rating_distr = {}
    X = np.transpose(X)
    for index, item in enumerate(X):
        nr_ratings = 0
        for rating in item:
            if rating > 0:
                nr_ratings += 1
        if nr_ratings == 0:
            print(index, item)
            continue
        if nr_ratings in rating_distr:
            rating_distr[nr_ratings] += 1
        else:
            rating_distr[nr_ratings] = 1
    print(rating_distr)
    plt.bar(rating_distr.keys(), rating_distr.values())
    plt.show()
def test_avg_rating_gender_per_movie_1m():
    import MovieLensData as MD
    from scipy.stats import ttest_ind, mannwhitneyu
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()
    movies = {}
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f.readlines():
            id, name, genre = line.replace("\n", "").split("::")
            movies[int(id)] = name + "::" + genre
    counter = 0
    print(len(user_item[0]))
    for movie_id in range(len(user_item[0])):
        ratings = user_item[:, movie_id]
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(ratings):
            if rating > 0:
                if gender_dict[user_id] == 'M':
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)
        try:
            _, p_value = mannwhitneyu(male_ratings, female_ratings)
            if p_value < 0.05 / len(user_item[0]):  # Bonferroni correction over all movies
                #print(movie_id+1, "%.2f" % np.average(male_ratings), len(male_ratings), "%.2f" % np.average(female_ratings), len(female_ratings), p_value)
                counter += 1
                #plt.bar(["male", "female"], [np.average(male_ratings), np.average(female_ratings)])
                #plt.show()
                if np.average(male_ratings) > np.average(female_ratings):
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::M")
                if np.average(male_ratings) < np.average(female_ratings):
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::F")
        except Exception:
            print("Testing failed for", movie_id)
    print(str(1 + 1) + "::" + movies[1])
    print(counter)
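# Toy illustration of the test used above: Mann-Whitney U on two rating samples,
# with a Bonferroni-corrected significance level (0.05 divided by the number of
# movies tested). Self-contained; the rating values are made up.
def _demo_mannwhitney_bonferroni():
    from scipy.stats import mannwhitneyu
    male_ratings = [5, 4, 5, 4, 5, 3, 4, 5]
    female_ratings = [2, 3, 1, 2, 3, 2, 4, 1]
    _, p_value = mannwhitneyu(male_ratings, female_ratings)
    n_tests = 3952                      # one test per MovieLens 1M movie
    return p_value < 0.05 / n_tests     # True only if significant after correction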
def rating_swap_1m():
    plot = False
    low_bound, high_bound = 100, 1500
    # swap 0 ratings with non zero ratings:
    X = np.transpose(MD.load_user_item_matrix_1m())
    X_obf = np.transpose(MD.load_user_item_matrix_1m())
    nr_ratings = []
    for item in X:
        nr_rating = 0
        for rating in item:
            if rating > 0:
                nr_rating += 1
        nr_ratings.append(nr_rating)
    fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True)
    if plot:
        #plt.subplot(1,2,1)
        ax1.bar(range(1, len(X) + 1), nr_ratings)
        ax1.set_xlabel("movie id")
        ax1.set_ylabel("nr ratings")
    # we want to remove ratings from movies that have more than 1500 ratings:
    amount_removed = 0
    for item_index, item in enumerate(X):
        if nr_ratings[item_index] > high_bound:
            indices = np.argwhere(X[item_index, :] > 0)[:, 0]
            indices = np.random.choice(indices, size=(nr_ratings[item_index] - high_bound,), replace=False)
            amount_removed += len(indices)
            for i in indices:
                X_obf[item_index, i] = 0
    """
    To check if the removal is working
    nr_ratings = []
    for item in X_obf:
        nr_rating = 0
        for rating in item:
            if rating > 0:
                nr_rating += 1
        nr_ratings.append(nr_rating)
    if plot:
        plt.bar(range(1, len(X) + 1), nr_ratings)
        plt.xlabel("movie id")
        plt.ylabel("nr ratings")
        plt.show()
    """
    # now we want to add ratings to movies with a small number of ratings:
    print(np.asarray(nr_ratings))
    indices = np.argwhere(np.asarray(nr_ratings) < low_bound)[:, 0]
    print(indices)
    nr_few_rated_movies = len(indices)
    nr_to_be_added = amount_removed / nr_few_rated_movies
    print(nr_to_be_added)
    for item_index, item in enumerate(X):
        if nr_ratings[item_index] < low_bound:
            indices = np.argwhere(X[item_index, :] == 0)[:, 0]
            indices = np.random.choice(indices, size=(int(nr_to_be_added),), replace=False)
            for i in indices:
                X_obf[item_index, i] = np.random.randint(1, 6)
    # To check if the removal and adding is working:
    nr_ratings = []
    for item in X_obf:
        nr_rating = 0
        for rating in item:
            if rating > 0:
                nr_rating += 1
        nr_ratings.append(nr_rating)
    if plot:
        #plt.subplot(1,2,2)
        ax2.bar(range(1, len(X) + 1), nr_ratings)
        ax2.set_xlabel("movie id")
        ax2.set_ylabel("nr ratings")
        plt.show()
    X_obf = np.transpose(X_obf)
    # output the data in a file:
    with open("ml-1m/rebalanced_(" + str(low_bound) + "," + str(high_bound) + ").dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                            + str(int(rating)) + "::000000000\n")
    return X_obf
def blurMe_1m():
    sample_mode = list(['random', 'sampled', 'greedy'])[2]
    rating_mode = list(['highest', 'avg', 'pred'])[1]
    top = 10
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    #X = Utils.normalize(X)
    avg_ratings = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        ratings = []
        for rating in X[:, item_id]:
            if rating > 0:
                ratings.append(rating)
        if len(ratings) == 0:
            avg_ratings[item_id] = 0
        else:
            avg_ratings[item_id] = np.average(ratings)
    # 1: get the set of most correlated movies, L_f and L_m:
    T = MD.load_gender_vector_1m()  # max_user=max_user)
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    cv = StratifiedKFold(n_splits=10)
    coefs = []
    for train, test in cv.split(X_train, T_train):
        x, t = X_train[train], T_train[train]
        random_state = np.random.RandomState(0)
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        # rank the coefs:
        ranks = ss.rankdata(model.coef_[0])
        coefs.append(ranks)
    coefs = np.average(coefs, axis=0)
    coefs = [[coefs[i], i + 1] for i in range(len(coefs))]
    coefs = np.asarray(list(sorted(coefs)))
    L_m = coefs[:top, 1]
    L_f = coefs[coefs.shape[0] - top:, 1]
    L_f = list(reversed(L_f))
    """
    movie_dict = MD.load_movie_id_dictionary_1m()
    print("males")
    for id in L_m:
        print(movie_dict[int(id)])
    print("females")
    for id in L_f:
        print(movie_dict[int(id)])
    """
    # Now, where we have the two lists, we can start obfuscating the data:
    X = MD.load_user_item_matrix_1m()
    X_obf = MD.load_user_item_matrix_1m()
    p = 0.05
    prob_m = [movie_id / sum(L_m) for movie_id in L_m]
    prob_f = [movie_id / sum(L_f) for movie_id in L_f]
    for index, user in enumerate(X):
        k = 0
        for rating in user:
            if rating > 0:
                k += 1
        k *= p
        greedy_index = 0
        #print(k)
        if T[index] == 1:
            added = 0
            safety_counter = 0
            while added < k and safety_counter < 100:
                # select a random movie:
                if sample_mode == 'random':
                    movie_id = L_m[np.random.randint(0, len(L_m))]
                elif sample_mode == 'sampled':
                    movie_id = L_m[np.random.choice(range(len(L_m)), p=prob_m)]
                elif sample_mode == 'greedy':
                    movie_id = L_m[greedy_index]
                    greedy_index += 1
                    if greedy_index >= len(L_m):
                        safety_counter = 100
                if X_obf[index, int(movie_id) - 1] == 0:
                    if rating_mode == 'highest':
                        X_obf[index, int(movie_id) - 1] = 5
                    elif rating_mode == 'avg':
                        X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                    added += 1
                safety_counter += 1
        elif T[index] == 0:
            added = 0
            safety_counter = 0
            while added < k and safety_counter < 100:
                # select a random movie:
                if sample_mode == 'random':
                    movie_id = L_f[np.random.randint(0, len(L_f))]
                elif sample_mode == 'sampled':
                    movie_id = L_f[np.random.choice(range(len(L_f)), p=prob_f)]
                elif sample_mode == 'greedy':
                    movie_id = L_f[greedy_index]
                    greedy_index += 1
                    if greedy_index >= len(L_f):
                        safety_counter = 100
                if X_obf[index, int(movie_id) - 1] == 0:
                    if rating_mode == 'highest':
                        X_obf[index, int(movie_id) - 1] = 5
                    elif rating_mode == 'avg':
                        X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                    added += 1
                safety_counter += 1
    # output the data in a file:
    with open("ml-1m/blurme_obfuscated_" + str(p) + "_" + sample_mode + "_" + rating_mode + ".dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                            + str(int(rating)) + "::000000000\n")
    return X_obf
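# Self-contained sketch of the list-building step in blurMe_1m: fit a logistic
# regression on toy data, rank the coefficients with scipy.stats.rankdata, and
# take the lowest-ranked items as the "male" list L_m and the highest-ranked as
# the "female" list L_f (IDs are 1-based column indices, as above). Toy shapes
# only; this is an illustration, not part of the pipeline.
def _demo_blurme_lists(top=2):
    import numpy as np
    import scipy.stats as ss
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    X = rng.randint(0, 6, size=(100, 6))
    T = rng.randint(0, 2, size=(100,))
    model = LogisticRegression(penalty='l2')
    model.fit(X, T)
    ranks = ss.rankdata(model.coef_[0])
    coefs = np.asarray(sorted([[ranks[i], i + 1] for i in range(len(ranks))]))
    L_m = coefs[:top, 1]                      # most male-correlated item IDs
    L_f = list(reversed(coefs[-top:, 1]))     # most female-correlated item IDs
    return L_m, L_f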
def find_good_threshold():
    plt.rcParams.update({'font.size': 18})
    import Classifiers
    import Utils
    max_user = 6040
    max_item = 3952
    # X = MD.load_user_item_matrix_1m_limited_ratings(limit=200)  # max_user=max_user, max_item=max_item)
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()  # max_user=max_user)
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    # print(X)
    # X = Utils.remove_significant_features(X, T)
    # X = feature_selection(X, T, Utils.select_male_female_different)
    # X = Utils.normalize(X)
    # X = Utils.standardize(X)
    # X = chi2_selection(X, T)
    precision = 20
    begin, end = 0, 0.001
    auc_rel = []
    auc_irrel = []
    std_rel = []
    std_irrel = []
    size_r = []
    size_i = []
    for t in np.linspace(begin, end, precision):
        print(X_train.shape)
        X_train_important, X_train_compl = Utils.random_forest_selection(X_train, T_train, threshold=t)
        print(X_train_important.shape)
        size_r.append(X_train_important.shape[1])
        size_i.append(X_train_compl.shape[1])
        mean_auc_r, std_auc_r = Classifiers.log_reg(X_train_important, T_train, show_plot=False)
        mean_auc_i, std_auc_i = Classifiers.log_reg(X_train_compl, T_train, show_plot=False)
        auc_rel.append(mean_auc_r)
        auc_irrel.append(mean_auc_i)
        std_rel.append(std_auc_r)
        std_irrel.append(std_auc_i)
    auc_rel, auc_irrel, std_rel, std_irrel = np.asarray(auc_rel), np.asarray(auc_irrel), np.asarray(std_rel), np.asarray(std_irrel)
    plt.subplot(1, 2, 1)
    plt.plot(np.linspace(begin, end, precision), auc_rel, c='b', label='AUC of important features')
    auc_upper = np.minimum(auc_rel + std_rel, 1)
    auc_lower = np.maximum(auc_rel - std_rel, 0)
    plt.fill_between(np.linspace(begin, end, precision), auc_lower, auc_upper, color='grey', alpha=.2)
    plt.plot(np.linspace(begin, end, precision), auc_irrel, c='r', label='AUC of not important features')
    auc_upper = np.minimum(auc_irrel + std_irrel, 1)
    auc_lower = np.maximum(auc_irrel - std_irrel, 0)
    plt.fill_between(np.linspace(begin, end, precision), auc_lower, auc_upper, color='grey', alpha=.2)
    plt.xlabel("Threshold")
    plt.ylabel("Mean AUC")
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(np.linspace(begin, end, precision), size_r, c='b', label='number of important movies')
    plt.plot(np.linspace(begin, end, precision), size_i, c='r', label='number of irrelevant movies')
    plt.xlabel("Threshold")
    plt.ylabel("#samples in data")
    plt.legend()
    plt.show()
def global_stats_ML_obf():
    plt.rcParams.update({'font.size': 28})
    original = MD.load_user_item_matrix_1m()
    blurme = MD.load_user_item_matrix_1m_masked(file_index=71)
    blurmore = MD.load_user_item_matrix_1m_masked(file_index=55)
    ratings = [1, 2, 3, 4, 5]
    rating_distribution = np.zeros(shape=(3, 5))
    nr_ratings = np.zeros(shape=(3, blurme.shape[0]))
    rating_distribution_new = np.zeros(shape=(5,))
    for user_index in range(blurme.shape[0]):
        for movie_index in range(blurme.shape[1]):
            #if original[user_index, movie_index] != blurme[user_index, movie_index]:
            #    rating_distribution_new[int(blurme[user_index, movie_index]) - 1] += 1
            if original[user_index, movie_index] in ratings:
                nr_ratings[0, user_index] += 1
                rating_distribution[0, int(original[user_index, movie_index]) - 1] += 1
            if blurme[user_index, movie_index] in ratings:
                nr_ratings[1, user_index] += 1
                rating_distribution[1, int(blurme[user_index, movie_index]) - 1] += 1
            if blurmore[user_index, movie_index] in ratings:
                nr_ratings[2, user_index] += 1
                rating_distribution[2, int(blurmore[user_index, movie_index]) - 1] += 1
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    ax1.bar(range(1, 6), rating_distribution[1, :] - rating_distribution[0, :])
    ax2.bar(range(1, 6), rating_distribution[2, :] - rating_distribution[0, :])
    plt.show()
    print("new ratings:", rating_distribution)
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    rating_distribution[0, :] /= sum(rating_distribution[0, :])
    rating_distribution[1, :] /= sum(rating_distribution[1, :])
    rating_distribution[2, :] /= sum(rating_distribution[2, :])
    print(rating_distribution)
    nr_ratings[0, :] = list(reversed(sorted(nr_ratings[0, :])))
    nr_ratings[1, :] = list(reversed(sorted(nr_ratings[1, :])))
    nr_ratings[2, :] = list(reversed(sorted(nr_ratings[2, :])))
    print(nr_ratings)
    ax1.bar(range(1, 6), rating_distribution[0, :])
    ax1.set_xlabel("Rating")
    ax1.set_ylabel("Rating Frequency in %")
    ax1.set_title("Rating frequency \noriginal ML")
    ax2.bar(range(1, 6), rating_distribution[1, :])
    ax2.set_xlabel("Rating")
    #ax2.set_ylabel("Rating Frequency")
    ax2.set_title("Rating frequency \nBlurMe")
    ax3.bar(range(1, 6), rating_distribution[2, :])
    ax3.set_xlabel("Rating")
    #ax3.set_ylabel("Rating Frequency")
    ax3.set_title("Rating frequency \nBlurM(or)e")
    plt.show()
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    ax1.bar(range(100), nr_ratings[0, 0:100])
    ax2.bar(range(100), nr_ratings[1, 0:100])
    ax3.bar(range(100), nr_ratings[2, 0:100])
    plt.show()
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    ax1.plot(range(300), nr_ratings[0, -300:])
    ax2.plot(range(300), nr_ratings[1, -300:])
    ax3.plot(range(300), nr_ratings[2, -300:])
    plt.show()
def blurMe_1m():
    sample_mode = list(['random', 'sampled', 'greedy'])[2]
    rating_mode = list(['highest', 'avg', 'pred'])[1]
    top = -1
    p = 0.01
    dataset = ['ML', 'Fx', 'Li'][0]
    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
        T = MD.load_gender_vector_1m()  # max_user=max_user)
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()
    #X = Utils.normalize(X)
    avg_ratings = np.zeros(shape=X.shape[0])
    for index, user in enumerate(X):
        ratings = []
        for rating in user:
            if rating > 0:
                ratings.append(rating)
        if len(ratings) == 0:
            avg_ratings[index] = 0
        else:
            avg_ratings[index] = np.average(ratings)
    """ AVERAGE ACROSS MOVIE
    avg_ratings = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        ratings = []
        for rating in X[:, item_id]:
            if rating > 0:
                ratings.append(rating)
        if len(ratings) == 0:
            avg_ratings[item_id] = 0
        else:
            avg_ratings[item_id] = np.average(ratings)
    """
    # 1: get the set of most correlated movies, L_f and L_m:
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    print("lists")
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    cv = StratifiedKFold(n_splits=10)
    coefs = []
    avg_coefs = np.zeros(shape=(len(X_train[1]),))
    random_state = np.random.RandomState(0)
    for train, test in cv.split(X_train, T_train):
        x, t = X_train[train], T_train[train]
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        # rank the coefs:
        ranks = ss.rankdata(model.coef_[0])
        coefs.append(ranks)
        #print(len(model.coef_[0]),len(X_train[0]))
        avg_coefs += model.coef_[0]
    coefs = np.average(coefs, axis=0)
    coefs = [[coefs[i], i + 1, avg_coefs[i]] for i in range(len(coefs))]
    coefs = np.asarray(list(sorted(coefs)))
    if top == -1:
        values = coefs[:, 2]
        index_zero = np.where(values == np.min(np.abs(values)))
        top_male = index_zero[0][0]
        top_female = index_zero[0][-1]
        L_m = coefs[:top_male, 1]
        R_m = 3952 - coefs[:top_male, 0]
        C_m = np.abs(coefs[:top_male, 2])
        L_f = coefs[coefs.shape[0] - top_female:, 1]
        L_f = list(reversed(L_f))
        R_f = coefs[coefs.shape[0] - top_female:, 0]
        R_f = list(reversed(R_f))
        C_f = coefs[coefs.shape[0] - top_female:, 2]
        C_f = list(reversed(np.abs(C_f)))
    else:
        L_m = coefs[:top, 1]
        R_m = 3952 - coefs[:top, 0]
        C_m = np.abs(coefs[:top, 2])
        L_f = coefs[coefs.shape[0] - top:, 1]
        L_f = list(reversed(L_f))
        R_f = coefs[coefs.shape[0] - top:, 0]
        R_f = list(reversed(R_f))
        C_f = coefs[coefs.shape[0] - top:, 2]
        C_f = list(reversed(np.abs(C_f)))
    #print(R_f)
    """
    id_index, index_id = MD.load_movie_id_index_dict()
    movies = []
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f.readlines():
            movies.append(line.replace("\n", ""))
    for index, val in enumerate(L_m[0:10]):
        print(index, movies[id_index[int(val)]], C_m[index])
    for index, val in enumerate(L_f[0:10]):
        print(index, movies[id_index[int(val)]], C_f[index])
    movie_dict = MD.load_movie_id_dictionary_1m()
    print("males")
    for id in L_m:
        print(movie_dict[int(id)])
    print("females")
    for id in L_f:
        print(movie_dict[int(id)])
    """
    print("obfuscation")
    # Now, where we have the two lists, we can start obfuscating the data:
    #X = MD.load_user_item_matrix_1m()
    X_obf = np.copy(X)
    #X = Utils.normalize(X)
    #X_obf = Utils.normalize(X_obf)
    prob_m = []  #[c / sum(C_m) for c in C_m]
    prob_f = []  #[c / sum(C_f) for c in C_f]
    print("obfuscation")
    for index, user in enumerate(X):
        print(index)
        k = 0
        for rating in user:
            if rating > 0:
                k += 1
        k *= p
        greedy_index = 0
        #print(k)
        if T[index] == 1:
            added = 0
            safety_counter = 0
            while added < k and safety_counter < 100:
                # select a random movie:
                if sample_mode == 'random':
                    movie_id = L_m[np.random.randint(0, len(L_m))]
                elif sample_mode == 'sampled':
                    movie_id = L_m[np.random.choice(range(len(L_m)), p=prob_m)]
                elif sample_mode == 'greedy':
                    movie_id = L_m[greedy_index]
                    greedy_index += 1
                    if greedy_index >= len(L_m):
                        safety_counter = 100
                if X_obf[index, int(movie_id) - 1] == 0:
                    if rating_mode == 'highest':
                        X_obf[index, int(movie_id) - 1] = 5
                    elif rating_mode == 'avg':
                        X_obf[index, int(movie_id) - 1] = avg_ratings[int(index)]
                    added += 1
                safety_counter += 1
        elif T[index] == 0:
            added = 0
            safety_counter = 0
            while added < k and safety_counter < 100:
                # select a random movie:
                if sample_mode == 'random':
                    movie_id = L_f[np.random.randint(0, len(L_f))]
                elif sample_mode == 'sampled':
                    movie_id = L_f[np.random.choice(range(len(L_f)), p=prob_f)]
                elif sample_mode == 'greedy':
                    movie_id = L_f[greedy_index]
                    greedy_index += 1
                    if greedy_index >= len(L_f):
                        safety_counter = 100
                if X_obf[index, int(movie_id) - 1] == 0:
                    if rating_mode == 'highest':
                        X_obf[index, int(movie_id) - 1] = 5
                    elif rating_mode == 'avg':
                        X_obf[index, int(movie_id) - 1] = avg_ratings[int(index)]
                    added += 1
                safety_counter += 1
    # output the data in a file:
    output_file = ""
    if dataset == 'ML':
        output_file = "ml-1m/"
        with open(output_file + "blurme_obfuscated_" + str(p) + "_" + sample_mode + "_" + rating_mode
                  + "_top" + str(top) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    elif dataset == 'Fx':
        import FlixsterData as FD
        output_file = "Flixster/"
        user_id2index, user_index2id = FD.load_user_id_index_dict()
        movie_id2index, movie_index2id = FD.load_movie_id_index_dict()
        with open(output_file + "FX_blurme_obfuscated_" + str(p) + "_" + sample_mode + "_" + rating_mode
                  + "_top" + str(top) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(user_index2id[index_user]) + "::" + str(movie_index2id[index_movie]) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    else:
        with open("libimseti/LST_blurme_obfuscated_" + str(p) + "_" + sample_mode + "_" + rating_mode
                  + "_top" + str(top) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    return X_obf
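# Sketch of reading back the "user::item::rating::timestamp" lines that the
# writers above emit. The path and the (6040, 3952) MovieLens 1M dimensions are
# assumptions for illustration; adjust them for the Flixster/LibimSeTi outputs.
def _demo_read_obfuscated(path="ml-1m/blurme_obfuscated_0.01_greedy_avg_top-1.dat"):
    import numpy as np
    X = np.zeros(shape=(6040, 3952))
    with open(path, 'r') as f:
        for line in f:
            user_id, movie_id, rating, _ = line.strip().split("::")
            X[int(user_id) - 1, int(movie_id) - 1] = int(rating)
    return X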
def blurMePP():
    top = -1
    sample_mode = list(['random', 'sampled', 'greedy'])[2]
    id_index, index_id = MD.load_movie_id_index_dict()
    notice_factor = 2
    p = 0.1
    dataset = ['ML', 'Fx', 'Li'][2]
    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
        T = MD.load_gender_vector_1m()  # max_user=max_user)
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()
    # X = Utils.normalize(X)
    avg_ratings = np.zeros(shape=X.shape[1])
    initial_count = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        ratings = []
        for rating in X[:, item_id]:
            if rating > 0:
                ratings.append(rating)
        if len(ratings) == 0:
            avg_ratings[item_id] = 0
        else:
            avg_ratings[item_id] = np.average(ratings)
        initial_count[item_id] = len(ratings)
    max_count = initial_count * notice_factor
    # 1: get the set of most correlated movies, L_f and L_m:
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    cv = StratifiedKFold(n_splits=10)
    coefs = []
    avg_coefs = np.zeros(shape=(len(X[1]),))
    random_state = np.random.RandomState(0)
    for train, test in cv.split(X, T):
        x, t = X[train], T[train]
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        # rank the coefs:
        ranks = ss.rankdata(model.coef_[0])
        coefs.append(ranks)
        # print(len(model.coef_[0]),len(X_train[0]))
        avg_coefs += model.coef_[0]
    coefs = np.average(coefs, axis=0)
    coefs = [[coefs[i], i + 1, avg_coefs[i]] for i in range(len(coefs))]
    coefs = np.asarray(list(sorted(coefs)))
    if top == -1:
        values = coefs[:, 2]
        index_zero = np.where(values == np.min(np.abs(values)))
        top_male = index_zero[0][0]
        top_female = index_zero[0][-1]
        L_m = coefs[:top_male, 1]
        R_m = 3952 - coefs[:top_male, 0]
        C_m = np.abs(coefs[:top_male, 2])
        L_f = coefs[coefs.shape[0] - top_female:, 1]
        L_f = list(reversed(L_f))
        R_f = coefs[coefs.shape[0] - top_female:, 0]
        R_f = list(reversed(R_f))
        C_f = coefs[coefs.shape[0] - top_female:, 2]
        C_f = list(reversed(np.abs(C_f)))
    else:
        L_m = coefs[:top, 1]
        R_m = 3952 - coefs[:top, 0]
        C_m = np.abs(coefs[:top, 2])
        L_f = coefs[coefs.shape[0] - top:, 1]
        L_f = list(reversed(L_f))
        R_f = coefs[coefs.shape[0] - top:, 0]
        R_f = list(reversed(R_f))
        C_f = coefs[coefs.shape[0] - top:, 2]
        C_f = list(reversed(np.abs(C_f)))
    # Now, where we have the two lists, we can start obfuscating the data:
    #X = MD.load_user_item_matrix_1m()
    #np.random.shuffle(X)
    #print(X.shape)
    X_obf = np.copy(X)
    total_added = 0
    for index, user in enumerate(X):
        print(index)
        k = 0
        for rating in user:
            if rating > 0:
                k += 1
        k *= p
        greedy_index_m = 0
        greedy_index_f = 0
        # print(k)
        added = 0
        if T[index] == 1:
            safety_counter = 0
            while added < k and safety_counter < 1000:
                if greedy_index_m >= len(L_m):
                    safety_counter = 1000
                    continue
                if sample_mode == 'greedy':
                    movie_id = L_m[greedy_index_m]
                if sample_mode == 'random':
                    movie_id = L_m[np.random.randint(0, len(L_m))]
                greedy_index_m += 1
                rating_count = sum([1 if x > 0 else 0 for x in X_obf[:, int(movie_id) - 1]])
                if rating_count > max_count[int(movie_id) - 1]:
                    continue
                if X_obf[index, int(movie_id) - 1] == 0:
                    X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                    added += 1
                safety_counter += 1
        elif T[index] == 0:
            safety_counter = 0
            while added < k and safety_counter < 1000:
                if greedy_index_f >= len(L_f):
                    safety_counter = 1000
                    continue
                if sample_mode == 'greedy':
                    movie_id = L_f[greedy_index_f]
                if sample_mode == 'random':
                    movie_id = L_f[np.random.randint(0, len(L_f))]
                greedy_index_f += 1
                rating_count = sum([1 if x > 0 else 0 for x in X_obf[:, int(movie_id) - 1]])
                if rating_count > max_count[int(movie_id) - 1]:
                    continue
                if X_obf[index, int(movie_id) - 1] == 0:
                    X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                    added += 1
                safety_counter += 1
        total_added += added
    # Now remove ratings from users that have more than 200 ratings equally:
    nr_many_ratings = 0
    for user in X:
        rating_count = sum([1 if x > 0 else 0 for x in user])
        if rating_count > 200:
            nr_many_ratings += 1
    print(nr_many_ratings)
    nr_remove = total_added / nr_many_ratings
    for user_index, user in enumerate(X):
        rating_count = sum([1 if x > 0 else 0 for x in user])
        if rating_count > 200:
            to_be_removed_indices = np.random.choice(np.argwhere(user > 0)[:, 0], size=(int(nr_remove),), replace=False)
            X_obf[user_index, to_be_removed_indices] = 0
    # finally, shuffle the user vectors:
    #np.random.shuffle(X_obf)
    # output the data in a file:
    output_file = ""
    if dataset == 'ML':
        output_file = "ml-1m/"
        with open(output_file + "blurmepp_obfuscated_" + sample_mode + "_" + str(p) + "_" + str(notice_factor) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    elif dataset == 'Fx':
        import FlixsterData as FD
        output_file = "Flixster/"
        user_id2index, user_index2id = FD.load_user_id_index_dict()
        movie_id2index, movie_index2id = FD.load_movie_id_index_dict()
        with open(output_file + "FX_blurmepp_obfuscated_" + sample_mode + "_" + str(p) + "_" + str(notice_factor) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(user_index2id[index_user]) + "::" + str(movie_index2id[index_movie]) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    else:
        with open("libimseti/LST_blurmepp_obfuscated_" + sample_mode + "_" + str(p) + "_" + str(notice_factor) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    return X_obf
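# Toy version of the BlurM(or)e removal step: a user with more than `threshold`
# ratings loses `nr_remove` randomly chosen existing ratings, so the total
# rating count stays roughly balanced after the additions. Self-contained;
# sizes are illustrative only.
def _demo_remove_ratings(nr_remove=2, threshold=3):
    import numpy as np
    rng = np.random.RandomState(0)
    user = np.array([5, 0, 3, 4, 0, 2, 1])
    if np.sum(user > 0) > threshold:
        drop = rng.choice(np.argwhere(user > 0)[:, 0], size=(nr_remove,), replace=False)
        user[drop] = 0  # zero means "no rating" throughout this module
    return user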
def blurMeBetter():
    top = -1
    sample_mode = list(['random', 'sampled', 'greedy'])[2]
    p = 0.05
    id_index, index_id = MD.load_movie_id_index_dict()
    notice_factor = 2
    certainty_threshold = 0.8
    dataset = ['ML', 'Fx', 'Li'][0]
    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
        T = MD.load_gender_vector_1m()  # max_user=max_user)
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()
    # X = Utils.normalize(X)
    avg_ratings = np.zeros(shape=X.shape[1])
    initial_count = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        ratings = []
        for rating in X[:, item_id]:
            if rating > 0:
                ratings.append(rating)
        if len(ratings) == 0:
            avg_ratings[item_id] = 0
        else:
            avg_ratings[item_id] = np.average(ratings)
        initial_count[item_id] = len(ratings)
    max_count = initial_count * notice_factor
    # 1: get the set of most correlated movies, L_f and L_m:
    #X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    #X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression
    cv = StratifiedKFold(n_splits=10)
    coefs = []
    avg_coefs = np.zeros(shape=(len(X[1]),))
    certainty = np.zeros(shape=(len(X),))
    random_state = np.random.RandomState(0)
    for train, test in cv.split(X, T):
        x, t = X[train], T[train]
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        # rank the coefs:
        ranks = ss.rankdata(model.coef_[0])
        coefs.append(ranks)
        # print(len(model.coef_[0]),len(X_train[0]))
        avg_coefs += model.coef_[0]
        x_test = X[test]
        class_prob = np.max(model.predict_proba(x_test), axis=1)
        # correct, so that 1 means the classifier is very sure and 0 means it is not sure
        class_prob -= 0.5
        class_prob *= 2
        certainty[test] = class_prob
        # set certainty to 0 for all misclassifications:
        t_pred = model.predict(x_test)
        t_test = T[test]
        for index, (pred, target) in enumerate(zip(t_pred, t_test)):
            #print(pred, target, index, test)
            if pred != target:
                certainty[test[index]] = 0
    """ plot certainty scores
    print("-------------------------")
    import matplotlib.pyplot as plt
    plt.bar(range(0, 50), certainty[0:50])
    plt.xlabel("user")
    plt.ylabel("certainty score")
    plt.show()
    """
    coefs = np.average(coefs, axis=0)
    coefs = [[coefs[i], i + 1, avg_coefs[i]] for i in range(len(coefs))]
    coefs = np.asarray(list(sorted(coefs)))
    if top == -1:
        values = coefs[:, 2]
        index_zero = np.where(values == np.min(np.abs(values)))
        top_male = index_zero[0][0]
        top_female = index_zero[0][-1]
        L_m = coefs[:top_male, 1]
        R_m = 3952 - coefs[:top_male, 0]
        C_m = np.abs(coefs[:top_male, 2])
        L_f = coefs[coefs.shape[0] - top_female:, 1]
        L_f = list(reversed(L_f))
        R_f = coefs[coefs.shape[0] - top_female:, 0]
        R_f = list(reversed(R_f))
        C_f = coefs[coefs.shape[0] - top_female:, 2]
        C_f = list(reversed(np.abs(C_f)))
    else:
        L_m = coefs[:top, 1]
        R_m = 3952 - coefs[:top, 0]
        C_m = np.abs(coefs[:top, 2])
        L_f = coefs[coefs.shape[0] - top:, 1]
        L_f = list(reversed(L_f))
        R_f = coefs[coefs.shape[0] - top:, 0]
        R_f = list(reversed(R_f))
        C_f = coefs[coefs.shape[0] - top:, 2]
        C_f = list(reversed(np.abs(C_f)))
    # Now, where we have the two lists, we can start obfuscating the data:
    #X = MD.load_user_item_matrix_1m()
    # np.random.shuffle(X)
    X_obf = np.copy(X)
    total_added = 0
    nr_skipped_users = 0
    for index, user in enumerate(X):
        if certainty[index] < certainty_threshold:
            nr_skipped_users += 1
            print(index, nr_skipped_users)
            continue
        k = 0
        for rating in user:
            if rating > 0:
                k += 1
        k *= p
        greedy_index = 0
        # print(k)
        added = 0
        if T[index] == 1:
            safety_counter = 0
            while added < k and safety_counter < 1000:
                if greedy_index >= len(L_m):
                    safety_counter = 1000
                    continue
                if sample_mode == 'greedy':
                    movie_id = L_m[greedy_index]
                if sample_mode == 'random':
                    movie_id = L_m[np.random.randint(0, len(L_m))]
                greedy_index += 1
                rating_count = sum([1 if x > 0 else 0 for x in X_obf[:, int(movie_id) - 1]])
                if rating_count > max_count[int(movie_id) - 1]:
                    continue
                if X_obf[index, int(movie_id) - 1] == 0:
                    X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                    added += 1
                safety_counter += 1
        elif T[index] == 0:
            safety_counter = 0
            while added < k and safety_counter < 1000:
                if greedy_index >= len(L_f):
                    safety_counter = 1000
                    continue
                if sample_mode == 'greedy':
                    movie_id = L_f[greedy_index]
                if sample_mode == 'random':
                    movie_id = L_f[np.random.randint(0, len(L_f))]
                greedy_index += 1
                rating_count = sum([1 if x > 0 else 0 for x in X_obf[:, int(movie_id) - 1]])
                if rating_count > max_count[int(movie_id) - 1]:
                    continue
                if X_obf[index, int(movie_id) - 1] == 0:
                    X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                    added += 1
                safety_counter += 1
        total_added += added
    print("nr of skipped users:", nr_skipped_users)
    # Now remove ratings from users that have more than 200 ratings equally:
    nr_many_ratings = 0
    for user in X:
        rating_count = sum([1 if x > 0 else 0 for x in user])
        if rating_count > 200:
            nr_many_ratings += 1
    nr_remove = total_added / nr_many_ratings
    for user_index, user in enumerate(X):
        rating_count = sum([1 if x > 0 else 0 for x in user])
        if rating_count > 200:
            to_be_removed_indices = np.random.choice(np.argwhere(user > 0)[:, 0], size=(int(nr_remove),), replace=False)
            X_obf[user_index, to_be_removed_indices] = 0
    # finally, shuffle the user vectors:
    # np.random.shuffle(X_obf)
    # output the data in a file:
    output_file = ""
    if dataset == 'ML':
        output_file = "ml-1m/"
        with open(output_file + "blurmebetter_obfuscated_" + sample_mode + "_" + str(p) + "_"
                  + str(notice_factor) + "_c" + str(certainty_threshold) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    elif dataset == 'Fx':
        import FlixsterData as FD
        output_file = "Flixster/"
        user_id2index, user_index2id = FD.load_user_id_index_dict()
        movie_id2index, movie_index2id = FD.load_movie_id_index_dict()
        with open(output_file + "FX_blurmebetter_obfuscated_" + sample_mode + "_" + str(p) + "_"
                  + str(notice_factor) + "_c" + str(certainty_threshold) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(user_index2id[index_user]) + "::" + str(movie_index2id[index_movie]) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    else:
        with open("libimseti/LST_blurmebetter_obfuscated_" + sample_mode + "_" + str(p) + "_"
                  + str(notice_factor) + "_c" + str(certainty_threshold) + ".dat", 'w') as f:
            for index_user, user in enumerate(X_obf):
                for index_movie, rating in enumerate(user):
                    if rating > 0:
                        f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::"
                                + str(int(np.round(rating))) + "::000000000\n")
    return X_obf
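# Sketch of the certainty score used in blurMeBetter: for a binary model the
# winning class probability from predict_proba lies in [0.5, 1.0], so it is
# rescaled to [0, 1]; misclassified users are forced to certainty 0 so the
# obfuscation never skips them. Toy data, self-contained.
def _demo_certainty():
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    X = rng.rand(50, 4)
    T = (X[:, 0] > 0.5).astype(int)
    model = LogisticRegression(penalty='l2').fit(X, T)
    certainty = (np.max(model.predict_proba(X), axis=1) - 0.5) * 2
    certainty[model.predict(X) != T] = 0    # no confidence in wrong predictions
    return certainty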