def show_gender_genre_comparison(): plt.rcParams.update({'font.size': 28}) # This plot shows the genres = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"] movie_genre = MD.load_movie_genre_matrix_1m() male_genre = np.zeros(shape=(len(genres, ))) female_genre = np.zeros(shape=(len(genres, ))) user_gender_dict = MD.gender_user_dictionary_1m() user_genre = MD.load_user_genre_matrix_1m() for user_index, user in enumerate(user_genre): if user_gender_dict[user_index] == "M": male_genre += user else: female_genre += user print(male_genre) print(female_genre) x = np.arange(len(genres)) ax = plt.subplot(111) ax.bar(x-0.2, male_genre/750000, width=0.4, label='male') ax.bar(x+0.2, female_genre/250000, width=0.4, label='female') plt.xticks(x, ("Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western")) plt.legend() plt.tight_layout() plt.setp(ax.get_xticklabels(), rotation=40) plt.ylabel("Normalized rating count") plt.xlabel("Genres") plt.show()
def rating_add_1m():
    """Add a percentage of random ratings to every user of the ML-1M matrix.

    For each user, up to ``nr_ratings * percentage`` random (movie, rating)
    entries are added at positions the user has not rated yet; the obfuscated
    matrix is written to ``ml-1m/random_added_obfuscated_<percentage>.dat`` in
    the standard ``user::movie::rating::timestamp`` format.

    Returns:
        np.ndarray: the obfuscated user-item matrix.
    """
    X = MD.load_user_item_matrix_1m()
    X_obf = MD.load_user_item_matrix_1m()
    percentage = 0.05
    for user_index, user in enumerate(X):
        # Count the user's existing ratings to size the number of additions.
        nr_ratings = 0
        for rating in user:
            if rating > 0:
                nr_ratings += 1
        added = 0
        safety_counter = 0
        # safety_counter bounds the retries when random positions keep hitting
        # already-rated movies (dense users).
        while added < nr_ratings*percentage and safety_counter < 100:
            index = np.random.randint(0, len(user))
            if X_obf[user_index, index] > 0:
                safety_counter += 1
                continue
            else:
                X_obf[user_index, index] = np.random.randint(1, 6)
                added += 1  # BUG FIX: was never incremented, so the loop only stopped via safety_counter
    # output the data in a file:
    with open("ml-1m/random_added_obfuscated_" + str(percentage) + ".dat", 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write(str(index_user + 1) + "::" + str(index_movie + 1) + "::" + str(
                        int(rating)) + "::000000000\n")
    return X_obf
def avg_rating_diff():
    """Plot the 20 movies with the largest male-female average-rating gap (both directions).

    For every ML-100K movie, computes the average male rating minus the average
    female rating (only over users who actually rated it) and shows a horizontal
    bar chart of the 20 most male-leaning and 20 most female-leaning titles.
    """
    X = MD.load_user_item_matrix_100k()
    T = MD.load_gender_vector_100k()
    name_dict = MD.load_movie_id_dictionary_100k()
    males_indecies = np.argwhere(T == 0)[:, 0]
    females_indecies = np.argwhere(T == 1)[:, 0]
    differences = np.zeros(shape=X.shape[1],)
    for movie_index, movie in enumerate(np.transpose(X)):
        # BUG FIX: the original iterated zip(male_ratings, female_ratings),
        # which truncates to the SHORTER gender group and silently drops the
        # remaining ratings. Filter each group independently instead.
        avg_m = [m_r for m_r in movie[males_indecies] if m_r > 0]
        avg_f = [f_r for f_r in movie[females_indecies] if f_r > 0]
        avg_m = np.average(avg_m)  # nan when the group has no ratings
        avg_f = np.average(avg_f)
        if not (np.isnan(avg_m) or np.isnan(avg_f)):
            differences[movie_index] = avg_m - avg_f
    # Pair each difference with its movie index, then sort descending.
    differences = [[differences[index], index] for index in range(differences.shape[0])]
    differences = np.asarray(list(reversed(sorted(differences))))
    print(differences[0:20, 1])
    # Top 20 male-leaning plus top 20 female-leaning movie names (ids are 1-based).
    names = [name_dict[index+1] for index in np.concatenate((differences[0:20, 1], differences[-20:, 1]))]
    print(names)
    fig, ax = plt.subplots()
    ax.barh(range(40), np.concatenate((differences[0:20, 0], differences[-20:, 0]), axis=0), align='center')
    ax.set_yticks(range(40))
    ax.set_yticklabels(names)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Difference')
    ax.set_title('Rating difference between males and females')
    plt.show()
def one_million_obfuscated(classifier):
    """Run a cross-validated ROC comparison of gender inference on original vs. obfuscated ML-1M data.

    Args:
        classifier: unused here (kept for signature parity with the sibling
            ``one_million*`` helpers); the model is a fixed LogisticRegression.
    """
    #X2 = MD.load_user_item_matrix_1m() # max_user=max_user, max_item=max_item)
    T = MD.load_gender_vector_1m()  # max_user=max_user)
    X1 = MD.load_user_item_matrix_1m()
    # X2 is the obfuscated counterpart of X1 (masked file #55).
    X2 = MD.load_user_item_matrix_1m_masked(
        file_index=55)  # max_user=max_user, max_item=max_item)
    #X2 = X1
    print(X1.shape, X2.shape)
    #X1, T = Utils.balance_data(X1, T)
    #X2, T2 = Utils.balance_data(X2, T)
    #X1 = Utils.normalize(X1)
    #X2 = Utils.normalize(X2)
    # 80/20 split: train on original data, test on obfuscated data.
    X_train, T_train = X1[0:int(0.8 * len(X1))], T[0:int(0.8 * len(X1))]
    X_test, T_test = X2[int(0.8 * len(X2)):], T[int(0.8 * len(X2)):]
    print(list(X1[0, :]))
    print(list(X2[0, :]))
    # print(X)
    print("before", X_train.shape)
    # X = Utils.remove_significant_features(X, T)
    # X_train, _ = Utils.random_forest_selection(X_train, T_train)
    # X = feature_selection(X, T, Utils.select_male_female_different)
    print(X_train.shape)
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    Utils.ROC_cv_obf(X1, X2, T, model)
    # NOTE(review): this trailing re-assignment is dead code — the fresh model
    # is never used afterwards.
    model = LogisticRegression(penalty='l2', random_state=random_state)
def comp_BM_and_BMpp():
    """Compare per-movie rating counts of the original, BlurMe, and BlurM(or)e ML-1M data.

    Draws three bar charts (movies 0..49) sharing a y-axis, prints the total
    counts for each variant, and prints the per-movie count deltas.
    """
    plt.rcParams.update({'font.size': 28})
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    interval_start, interval_end = 0, 50
    # (masked file index or None for the original, subplot title, print label)
    panels = [
        (None, "(A)\nOriginal data", "Original Data:"),
        (63, "(B)\nBlurMe data", "BlurMe Data:"),
        (75, "(C)\nBlurM(or)e data", "BlurMe++ Data:"),
    ]
    counts = []
    for axis, (file_idx, title, label) in zip((ax1, ax2, ax3), panels):
        if file_idx is None:
            matrix = MD.load_user_item_matrix_1m()
        else:
            matrix = MD.load_user_item_matrix_1m_masked(file_index=file_idx)
        # Number of nonzero ratings per movie within the interval.
        per_movie = [sum([1 if x > 0 else 0 for x in movie])
                     for movie in np.transpose(matrix)[interval_start:interval_end]]
        counts.append(np.asarray(per_movie))
        axis.bar(range(interval_start, interval_end), per_movie)
        axis.set_title(title)
        axis.set_xlabel("movie ID")
        axis.set_ylabel("#ratings")
        print(label, sum(per_movie))
    # Per-movie deltas: original vs BlurMe, and BlurMe vs BlurM(or)e.
    print(counts[0] - counts[1])
    print(counts[1] - counts[2])
    plt.show()
def one_hundert_k(classifier): X = MD.load_user_item_matrix_100k() # max_user=max_user, max_item=max_item) #X = normalize(X) T = MD.load_age_vector_100k() # max_user=max_user) #X = chi2_selection(X, T) classifier(X, T)
def one_hundert_k(classifier): X = MD.load_user_item_matrix_100k( ) # max_user=max_user, max_item=max_item) T = MD.load_occupation_vector_100k() # max_user=max_user) #X = MD.chi2_selection(X, T) classifier(X, T, multiclass=True)
def one_hundert_k_obfuscated(classifier): X = MD.load_user_item_matrix_100k_masked( ) # max_user=max_user, max_item=max_item) T = MD.load_gender_vector_100k() # max_user=max_user) #X = MD.chi2_selection(X, T) classifier(X, T)
def one_million(classifier):
    """Run ``classifier`` on the masked ML-1M matrix with the user's favourite GENRE as the target.

    The target T is derived from the one-hot top-genre matrix; class labels are
    the genre column indices. NOTE(review): this name is redefined several
    times in the file, so only the last definition is callable at import time.
    """
    max_user = 6040
    max_item = 3952
    #X = MD.load_user_item_matrix_1m() # max_user=max_user, max_item=max_item)
    X = MD.load_user_item_matrix_1m_masked(file_index=71)
    T = MD.load_user_genre_matrix_1m(one_hot=True, top=5)
    # Convert one-hot rows to a class-index vector.
    T = np.argwhere(T == 1)[:, 1]
    print(min(T), max(T))
    """
    Note that we loose class 13 (Romance. it seems that no one has romance as favourite genre. This kinda makes sense
    because it correlates so much with drama and comedy.
    """
    import collections
    import matplotlib.pyplot as plt
    counter = collections.Counter(T)
    #plt.bar(counter.keys(), counter.values())
    #plt.xlabel("T")
    #plt.ylabel('frequency')
    #plt.show()
    print(counter)
    X = Utils.normalize(X)
    #print(T)
    #X = MD.feature_selection(X, T, f_regression)
    #X = MD.chi2_selection(X, T)
    classifier(X, T, multiclass=True, nr_classes=17)
def one_million(classifier): X = MD.load_user_item_matrix_1m() # max_user=max_user, max_item=max_item) #X = MD.load_user_item_matrix_1m_limited_ratings(limit=1) #X = MD.load_user_item_matrix_1m_binary() # X = MD.load_user_genre_matrix_100k_obfuscated() T = MD.load_gender_vector_1m() # max_user=max_user) #X, T = Utils.balance_data(X, T) #X = Utils.normalize(X) X = feature_selection(X, T, Utils.select_male_female_different) X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))] X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):] # print(X) print("before", X_train.shape) # X = Utils.remove_significant_features(X, T) #X_train, _ = Utils.random_forest_selection(X_train, T_train) # X = feature_selection(X, T, Utils.select_male_female_different) print(X_train.shape) # X = Utils.normalize(X) # X = Utils.standardize(X) # X = chi2_selection(X, T) classifier(X_train, T_train) from sklearn.linear_model import LogisticRegression random_state = np.random.RandomState(0) #model = Models.Dominant_Class_Classifier() model = LogisticRegression(penalty='l2', random_state=random_state) model.fit(X_train, T_train) Utils.ROC_plot(X_test, T_test, model)
def test_avg_rating_gender_per_movie_100k():
    """Find ML-100K movies whose male/female average ratings differ significantly.

    Runs a Bonferroni-corrected two-sample t-test per movie and prints
    ``id::title::M`` or ``id::title::F`` depending on which gender rates the
    movie higher; finally prints the number of significant movies.
    """
    import MovieLensData as MD
    from scipy.stats import ttest_ind, mannwhitneyu
    gender_vec = MD.load_gender_vector_100k()
    user_item = MD.load_user_item_matrix_100k()
    # Parse id -> title from the pipe-separated u.item file.
    movies = {}
    with open("ml-100k/u.item", 'r') as f:
        for line in f.readlines():
            i1 = line.find("|")
            movie_key = line[:i1]  # renamed from `id` to avoid shadowing the builtin
            i2 = line.find("|", i1 + 1)
            name = line[i1 + 1:i2]
            movies[int(movie_key)] = name
    counter = 0
    print(len(user_item[0]))
    for movie_id in range(len(user_item[0])):
        ratings = user_item[:, movie_id]
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(ratings):
            if rating > 0:
                if gender_vec[user_id] == 0:
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)
        try:
            # Substitute a single zero rating when a group is empty so the
            # averages below are well-defined.
            if len(male_ratings) == 0:
                male_ratings = np.array([0])
            if len(female_ratings) == 0:
                female_ratings = np.array([0])
            if np.average(male_ratings) == np.average(female_ratings):
                continue
            _, p_value = ttest_ind(male_ratings, female_ratings)
            # Bonferroni correction over all movies.
            if p_value < (0.05 / len(user_item[0])):
                counter += 1
                if np.average(male_ratings) > np.average(female_ratings):
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::M")
                if np.average(male_ratings) < np.average(female_ratings):
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::F")
        except Exception:  # BUG FIX: was a bare `except:` (also caught KeyboardInterrupt/SystemExit)
            print(male_ratings, female_ratings)
            print("Testing failed for", movie_id)
            continue
    print("counter", counter)
def feature_importance_1m():
    """Compare movie importances for gender inference from LogisticRegression vs RandomForest.

    Averages coefficients / feature importances over 10 fits, plots the
    random-forest importance distribution, then prints the top-100 movies by
    (a) highest LR coefficient, (b) lowest LR coefficient, and (c) highest RF
    importance, plus the overlap between (c) and (a)∪(b).
    """
    plt.rcParams.update({'font.size': 18})
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    importance = np.zeros(shape=(X.shape[1],))
    importance2 = np.zeros(shape=(X.shape[1],))
    # Average over 10 fits to smooth out randomness (RF is stochastic).
    for i in range(10):
        model = LogisticRegression()
        model2 = RandomForestClassifier()
        model.fit(X, T)
        model2.fit(X, T)
        importance += model.coef_[0]
        importance2 += model2.feature_importances_
    importance /= 10
    importance2 /= 10
    plt.hist(importance2, bins=np.linspace(0, 0.001, 50))
    plt.xlabel("importance")
    plt.ylabel("frequency")
    plt.title("Importance of movies distribution")
    plt.show()
    # Rank movie ids (1-based) by LR coefficient (descending and ascending)
    # and by RF importance (descending).
    importance_id = zip(importance, range(1, len(importance) + 1))
    importance_id = list(reversed(sorted(importance_id)))
    importance_id2 = zip(importance, range(1, len(importance) + 1))
    importance_id2 = list(sorted(importance_id2))
    importance_id3 = zip(importance2, range(1, len(importance2) + 1))
    importance_id3 = list(reversed(sorted(importance_id3)))
    set1 = set()
    set2 = set()
    set3 = set()
    names = MD.load_movie_id_dictionary_1m()
    top = 100
    # Renamed loop targets from `id` to avoid shadowing the builtin.
    for (_, id1), (_, id2), (_, id3) in zip(importance_id[0:top], importance_id2[0:top], importance_id3[0:top]):
        print(names[id1], "|", names[id2], "|", names[id3])
        set1.add(names[id1])
        set2.add(names[id2])
        set3.add(names[id3])
    # Movies the RF considers important that the LR also flags (either direction).
    print(set3.intersection(set2.union(set1)))
    # BUG FIX: removed a trailing orphaned `"""` that opened an unterminated
    # string literal; also dropped an unused `import seaborn as sns`.
def rating_distr():
    """Compare the rating-value distributions of original, BlurMe, and BlurMe++ ML-1M data.

    For each variant, prints the per-value frequencies, mean/variance
    statistics, and draws a bar chart of the counts for ratings 1..5.
    """
    T = MD.load_gender_vector_1m()
    X = MD.load_user_item_matrix_1m()
    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)
    # frequencies[v] counts matrix entries with value v (0 = unrated).
    frequencies = np.zeros(shape=(6,))
    ratings = []
    movie_ids = []
    for user in X:
        for index, rating in enumerate(user):
            frequencies[int(rating)] += 1
            if rating > 0:
                movie_ids.append(index+1)
                ratings.append(rating)
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies), "without 0:",
          np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    print(len(set(movie_ids)))  # number of distinct rated movies
    ax1.bar(range(5), frequencies[1:])
    ax1.set_xlabel("Original")
    X = MD.load_user_item_matrix_1m_masked(file_index=71)  # greedy 10%
    frequencies = np.zeros(shape=(6,))
    ratings = []
    for user in X:
        for rating in user:
            frequencies[int(rating)] += 1
            if rating > 0:
                ratings.append(rating)
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies), "without 0:",
          np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    ax2.bar(range(5), frequencies[1:])
    ax2.set_xlabel("BlurMe")
    X = MD.load_user_item_matrix_1m_masked(file_index=55)  # BlurMe++ 10%, fac=2
    frequencies = np.zeros(shape=(6,))
    ratings = []
    for user in X:
        for rating in user:
            frequencies[int(rating)] += 1
            if rating > 0:
                ratings.append(rating)
    print(frequencies)
    print(sum(frequencies[1:]), np.mean(frequencies[1:]), np.var(frequencies[1:]))
    print("mean:", np.dot(np.arange(0, 6), frequencies) / sum(frequencies), "without 0:",
          np.dot(np.arange(1, 6), frequencies[1:]) / sum(frequencies[1:]))
    print("Avg", np.average(ratings), "var", np.var(ratings))
    ax3.bar(range(5), frequencies[1:])
    ax3.set_xlabel("BlurMe++")
    plt.show()
def feature_importance_1m():
    """Inspect RandomForest movie importances for gender inference and relate them to rating counts.

    Averages feature importances over 10 fits, plots the first 30, prints the
    ids of movies with importance >= 0.002, then scatter-plots importance
    against the number of ratings each movie received.
    NOTE(review): duplicate of an earlier ``feature_importance_1m`` — the later
    definition shadows the earlier one at import time.
    """
    from sklearn.ensemble import RandomForestClassifier
    X = MD.load_user_item_matrix_1m()
    T = MD.load_gender_vector_1m()
    importance = np.zeros(shape=(X.shape[1], ))
    # Average over 10 stochastic fits.
    for i in range(10):
        model = RandomForestClassifier()
        model.fit(X, T)
        importance += model.feature_importances_
    importance /= 10
    plt.bar(range(1, len(importance[0:30]) + 1), importance[0:30])
    plt.xlabel("movie index")
    plt.ylabel("importance")
    plt.show()
    # Print 1-based ids of movies above the importance threshold.
    counter = 0
    for movie, score in enumerate(importance):
        if score >= 0.002:
            print(movie + 1, end=",")
            counter += 1
    print()
    print(counter)
    # Number of ratings per movie.
    nr_ratings = np.zeros(shape=(X.shape[1], ))
    for index, movie in enumerate(np.transpose(X)):
        counter = 0
        for rating in movie:
            if rating > 0:
                counter += 1
        nr_ratings[index] = counter
    avg_nr_per_importance = {}
    nr_ratings_importance = []
    for nr, imp in zip(nr_ratings, importance):
        if imp in avg_nr_per_importance:
            avg_nr_per_importance[imp].append(nr)
        else:
            avg_nr_per_importance[imp] = [nr]
        nr_ratings_importance.append([nr, imp])
    #for key in avg_nr_per_importance.keys():
    #    avg_nr_per_importance[key] = np.average(avg_nr_per_importance[key])
    #print(avg_nr_per_importance)
    # Left: all movies; right: only movies with fewer than 100 ratings.
    plt.subplot(1, 2, 1)
    for nr, imp in nr_ratings_importance:
        plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")
    plt.subplot(1, 2, 2)
    for nr, imp in nr_ratings_importance:
        if nr < 100:
            plt.scatter(nr, imp)
    plt.xlabel("#ratings")
    plt.ylabel("importance")
    plt.show()
def one_million(classifier):
    """Run ``classifier`` on the normalized ML-1M matrix with OCCUPATION as the multiclass target.

    NOTE(review): this name is defined multiple times in the file; only the
    last definition survives at import time.
    """
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    T = MD.load_occupation_vector_1m()  # max_user=max_user)
    X = Utils.normalize(X)
    #print(T)
    #X = MD.feature_selection(X, T, f_regression)
    #X = MD.chi2_selection(X, T)
    classifier(X, T, multiclass=True)
def one_million(classifier):
    """Run ``classifier`` on the ML-1M matrix with a binary AGE target (threshold 30).

    NOTE(review): duplicate ``one_million`` definition — shadowed by/shadowing
    the other variants in this file.
    """
    max_user = 6040
    max_item = 3952
    X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
    T = MD.load_age_vector_1m(border=30)  # max_user=max_user)
    #X = normalize(X)
    print(min(X[:, 0]), np.mean(X[:, 0]))
    #X = feature_selection(X, T, f_regression)
    #X = chi2_selection(X, T)
    classifier(X, T)
def lot_ratings():
    """Evaluate gender inference on ML-1M test users bucketed by how many ratings they have.

    Trains a LogisticRegression on 80% of users, then for four rating-count
    buckets plots a ROC curve over the matching test users and prints the
    confusion-matrix-derived measures.
    """
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    test_data = list(zip(X_test, T_test))
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    # Utils.ROC_plot(X_test, T_test, model)
    roc = True
    # Bucket boundaries: users with min_rating[i] <= #ratings <= max_rating.
    min_rating = [162, 211, 282, 390]
    for index, max_rating in enumerate([210, 281, 389, 1000]):
        selected_X = []
        selected_T = []
        for user, label in test_data:
            counter = 0
            for rating in user:
                if rating > 0:
                    counter += 1
            if min_rating[index] <= counter <= max_rating:
                selected_X.append(user)
                selected_T.append(label)
        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)
        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            # BUG FIX: the title printed the bounds reversed ("between 210 and 162").
            plt.title(
                'Receiver Operating Characteristic with users having rated between ' + str(
                    min_rating[index]) + " and " + str(max_rating) + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For max rating =", max_rating, ":")
        Y = model.predict(selected_X)
        # BUG FIX: measures were computed against the FULL test label vector T
        # instead of the labels of the selected subset (length mismatch).
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)
    if roc:
        plt.show()
def one_hundert_k_obfuscated(classifier):
    """ROC evaluation of gender inference trained on masked ML-100K data and tested on original data.

    Args:
        classifier: unused here (kept for signature parity); the model is a
            fixed LogisticRegression.
    NOTE(review): redefinition of the earlier ``one_hundert_k_obfuscated`` —
    this later definition shadows it.
    """
    X1 = MD.load_user_item_matrix_100k()
    X2 = MD.load_user_item_matrix_100k_masked(file_index=1)  # max_user=max_user, max_item=max_item)
    X3 = MD.load_user_item_matrix_100k_masked(file_index=-1)
    T = MD.load_gender_vector_100k()  # max_user=max_user)
    # Train on masked data (X3), test on the original matrix (X1).
    X_train, T_train = X3[0:int(0.8 * len(X3))], T[0:int(0.8 * len(X3))]
    X_test, T_test = X1[int(0.8 * len(X1)):], T[int(0.8 * len(X1)):]
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
def plot_genre_1m():
    """Plot the ML-1M genre distribution, uncombined (left) vs. with combined genres (right)."""
    genres = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
              "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    panels = (
        (ax1, False, "genre distribution in ML 1m"),
        (ax2, True, "genre distribution in ML 1m, Drama&Romance are combined to Drama ect."),
    )
    for axis, combine_flag, title in panels:
        # One bar per genre: total number of movies tagged with it.
        matrix = MD.load_movie_genre_matrix_1m(combine=combine_flag)
        axis.bar(genres, np.sum(matrix, axis=0))
        axis.set_title(title)
        plt.setp(axis.get_xticklabels(), rotation=-45, ha="left")
    plt.show()
def show_avg_rating_gender_per_movie(movie_id=1):
    """Bar-plot the average male vs. female rating for one movie column.

    Args:
        movie_id: column index into the ML-1M user-item matrix.
            NOTE(review): used directly as a 0-based column index — confirm
            whether callers pass a 1-based movie id.
    """
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()
    male_ratings, female_ratings = [], []
    for user_id, rating in enumerate(user_item[:, movie_id]):
        if rating <= 0:  # 0 means "not rated"
            continue
        bucket = male_ratings if gender_dict[user_id] == 'M' else female_ratings
        bucket.append(rating)
    plt.bar(["male", "female"], [np.average(male_ratings), np.average(female_ratings)])
    plt.show()
def loyal_vs_diverse():
    """Explore genre 'loyalty' (share of a user's ratings in their top genre) by gender.

    Counts each user's ratings per (combined) genre from the raw ratings file,
    prints loyal users for several thresholds, then scatter-plots every user's
    loyalty colored by gender and prints the per-gender averages.
    """
    #X = MD.load_user_item_matrix_1m()
    #T = MD.load_gender_vector_1m()
    genres = [
        "Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    movie_genre = MD.load_movie_genre_matrix_1m(combine=True)
    # user_genre_distr[u, g] = number of user u's ratings on movies of genre g.
    user_genre_distr = np.zeros(shape=(6040, movie_genre.shape[1]))
    user_gender_dict = MD.gender_user_dictionary_1m()
    print(user_genre_distr.shape)
    with open("ml-1m/ratings.dat", 'r') as f:
        for line in f.readlines():
            user_id, movie_id, rating, _ = line.split("::")
            movie_id = int(movie_id) - 1  # file ids are 1-based
            user_id = int(user_id) - 1
            user_genre_distr[user_id, :] += movie_genre[movie_id, :]
    loyal_percents = [0.5, 0.6, 0.7]
    for i, loyal_percent in enumerate(loyal_percents):
        loyal_count = 0
        for user_index, user in enumerate(user_genre_distr):
            # Loyalty = share of the dominant genre in the user's rating counts.
            if max(user) / sum(user) > loyal_percent:
                if True:
                    # print the user: gender plus their 5 most-rated genres.
                    print(user_gender_dict[user_index])
                    top_5_index = user.argsort()[-5:][::-1]
                    for index in top_5_index:
                        print(genres[index], user[index])
                loyal_count += 1
        print("For threshold", loyal_percent, ",", loyal_count, "users are considered loyal")
    if True:
        user_loyalty_male = []
        user_loyalty_female = []
        for user_index, user in enumerate(user_genre_distr):
            loyalty = max(user) / sum(user)
            if user_gender_dict[user_index] == 'M':
                user_loyalty_male.append(loyalty)
                plt.scatter(user_index, loyalty, c='b')
            else:
                user_loyalty_female.append(loyalty)
                plt.scatter(user_index, loyalty, c='r')
        print(np.average(user_loyalty_male))
        print(np.average(user_loyalty_female))
def loyal_ratings():
    """Evaluate gender inference on ML-1M test users bucketed by genre loyalty.

    Trains a LogisticRegression on 80% of users, then for four loyalty bands
    selects the matching test users via ``Utils.is_loyal``, plots a ROC curve
    per band, and prints the confusion-matrix-derived measures.
    """
    X = MD.load_user_item_matrix_1m()
    X = Utils.normalize(X)
    T = MD.load_gender_vector_1m()
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]
    test_data = list(zip(X_test, T_test))
    Test_indecies = range(int(0.8 * len(X)), len(X))
    print(len(X_test))
    from sklearn.linear_model import LogisticRegression
    random_state = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=random_state)
    model.fit(X_train, T_train)
    # Utils.ROC_plot(X_test, T_test, model)
    roc = True
    # Loyalty bands: (lower, upper] pairs.
    upper_bound = [0.17, 0.19, 0.42, 1]
    for index, percent_loyal in enumerate([0.0, 0.17, 0.35, 0.42]):
        # is_loyal works on 1-based user ids; convert back to 0-based indices.
        test_ids = [i + 1 for i in Test_indecies]
        selected_ids = Utils.is_loyal(test_ids, loyal_percent_lower=percent_loyal,
                                      loyal_percent_upper=upper_bound[index])
        selected_indecies = [i - 1 for i in selected_ids]
        selected_X = X[selected_indecies]
        selected_T = T[selected_indecies]
        probs = model.predict_proba(selected_X)
        preds = probs[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(selected_T, preds)
        roc_auc = metrics.auc(fpr, tpr)
        if roc:
            # method I: plt
            plt.subplot(2, 2, index + 1)
            plt.title('Receiver Operating Characteristic with users having a loyality between ' + str(
                percent_loyal) + ' and ' + str(upper_bound[index]) + ' making N=' + str(len(selected_X)))
            plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
            plt.legend(loc='lower right')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([0, 1])
            plt.ylim([0, 1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
        # print the confusion matrix:
        print("For loyality =", percent_loyal, ":")
        Y = model.predict(selected_X)
        # BUG FIX: measures were computed against the FULL label vector T
        # instead of the labels of the selected subset (length mismatch).
        TPR, TNR, FPR, FNR, precision, accuracy = Utils.performance_measures(Y, selected_T)
        print("TPR:", TPR, "TNR:", TNR, "FPR:", FPR, "FNR:", FNR, "precision:", precision, "accuracy:", accuracy)
    if roc:
        plt.show()
def is_loyal(user_ids, loyal_percent_lower=0.4, loyal_percent_upper=1):
    """Return the subset of ``user_ids`` whose genre loyalty lies in (lower, upper].

    Loyalty is the share of a user's ratings that fall on their single
    most-rated (combined) genre, computed from the raw ML-1M ratings file.

    Args:
        user_ids: iterable of 1-based ML-1M user ids.
        loyal_percent_lower: exclusive lower loyalty bound.
        loyal_percent_upper: inclusive upper loyalty bound.

    Returns:
        list of the 1-based ids that fall within the loyalty band.
    """
    import MovieLensData as MD
    movie_genre = MD.load_movie_genre_matrix_1m(combine=True)
    # genre_counts[u, g] = number of user u's ratings on movies of genre g.
    genre_counts = np.zeros(shape=(6040, movie_genre.shape[1]))
    with open("ml-1m/ratings.dat", 'r') as f:
        for line in f.readlines():
            uid, mid, _, _ = line.split("::")
            genre_counts[int(uid) - 1, :] += movie_genre[int(mid) - 1, :]
    loyal_users = []
    for original_id in user_ids:
        row = genre_counts[original_id - 1, :]
        share = max(row) / sum(row)
        if loyal_percent_upper >= share > loyal_percent_lower:
            loyal_users.append(original_id)
    return loyal_users
def compare_real_fake():
    """Visualize a 40x40 corner of the real ML-1M matrix against two obfuscated variants.

    Shows the real, BlurMe, and BlurM(or)e sub-matrices as images plus the
    pixel-wise differences real-BlurMe and real-BlurM(or)e.
    """
    import RealFakeData as RFD
    real = MD.load_user_item_matrix_1m()[0:40, 0:40]
    fake_bm = RFD.load_user_item_matrix_1m_masked(file_index=12)[0:40, 0:40]
    fake_bmpp = RFD.load_user_item_matrix_1m_masked(file_index=17)[0:40, 0:40]
    print(fake_bmpp.shape)
    # (subplot position in a 3x3 grid, image, title)
    panels = [
        (1, real, "real"),
        (4, fake_bm, "fake_bm"),
        (7, fake_bmpp, "fake_bmpp"),
        (5, real - fake_bm, "real-fake_bm"),
        (8, real - fake_bmpp, "real-fake_bmpp"),
    ]
    for position, image, title in panels:
        plt.subplot(3, 3, position)
        plt.imshow(image)
        plt.title(title)
    plt.show()
def load_real_fake_data_ML_1m(file_index=24):
    """Build a shuffled real-vs-fake classification dataset from ML-1M.

    Takes the first half of the real user-item matrix (label 1) and the second
    half of the obfuscated matrix selected by ``file_index`` (label 0), stacks
    them, and shuffles rows and labels in unison.

    Args:
        file_index: which masked (obfuscated) ML-1M file to load.

    Returns:
        (data, labels): feature matrix and binary label vector.
    """
    real = MD.load_user_item_matrix_1m()
    real = real[0:int(real.shape[0] / 2), :]
    fake = load_user_item_matrix_1m_masked(file_index=file_index)
    fake = fake[int(fake.shape[0] / 2):, :]
    # BUG FIX/cleanup: removed the dead `data = []` that was immediately
    # shadowed below; `real.shape[1]` replaces the obscure `real[0].shape[0]`.
    data = np.zeros(shape=(real.shape[0] + fake.shape[0], real.shape[1]))
    labels = np.zeros(shape=(real.shape[0] + fake.shape[0], ))
    for user_index, user in enumerate(real):
        data[user_index, :] = user
        labels[user_index] = 1  # real users
    for user_index, user in enumerate(fake):
        data[len(real) + user_index, :] = user
        labels[len(real) + user_index] = 0  # obfuscated users
    from Utils import shuffle_two_arrays
    data, labels = shuffle_two_arrays(data, labels)
    return data, labels
def show_correlation_genre():
    """Draw a heatmap of how often pairs of genres co-occur on the same ML-1M movie."""
    genres = ["Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
              "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
    movie_genre = MD.load_movie_genre_matrix_1m()
    print(movie_genre.shape)
    # cooc[i, j] = number of movies tagged with both genre i and genre j.
    cooc = np.zeros(shape=(movie_genre.shape[1], movie_genre.shape[1]))
    # show the simple co-occurrence matrix:
    for movie in movie_genre:
        # Collect every unordered pair of genres set on this movie.
        pairs = []
        for index1 in range(len(movie)):
            if movie[index1] == 1:
                for index2 in range(index1+1, len(movie)):
                    if movie[index2] == 1:
                        pairs.append([index1, index2])
        for one, two in pairs:
            cooc[one, two] += 1
            cooc[two, one] += 1  # keep the matrix symmetric
    plt.rcParams.update({'font.size': 22})
    import seaborn as sb
    fig, ax = plt.subplots()
    ax = sb.heatmap(cooc, linewidths=0.5)
    # We want to show all ticks...
    ax.set_xticks(np.arange(len(genres)))
    ax.set_yticks(np.arange(len(genres)))
    # ... and label them with the respective list entries
    ax.set_xticklabels(genres)
    ax.set_yticklabels(genres)
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right", rotation_mode="anchor")
    plt.setp(ax.get_xticklabels(), rotation=-30, ha="left", rotation_mode="anchor")
    plt.title("Co-occurrence of movie genres in ML 1m")
    plt.show()
def genre_exploration_1m():
    """Explore ML-1M genres: plot per-genre movie counts, then the distribution of genres-per-movie."""
    genres = [
        "Action", "Adventure", "Animation", "Children\'s", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    import MovieLensData as MD
    import matplotlib.pyplot as plt
    genre_matrix = MD.load_movie_genre_matrix_1m()
    # How many movies carry each genre tag.
    per_genre_totals = np.sum(genre_matrix, axis=0)
    plt.bar(genres, per_genre_totals)
    plt.show()
    print(per_genre_totals)
    # How many genre tags each movie carries.
    tags_per_movie = np.sum(genre_matrix, axis=1)
    import collections
    counter = collections.Counter(tags_per_movie)
    print(counter)
    plt.bar(counter.keys(), counter.values())
    plt.xlabel("#genres")
    plt.ylabel('frequency')
    plt.show()
def test_avg_rating_gender_per_movie_1m():
    """Find ML-1M movies whose male/female rating distributions differ significantly.

    Runs a Bonferroni-corrected Mann-Whitney U test per movie and prints
    ``id::title::genre::M`` or ``::F`` depending on which gender rates the
    movie higher; finally prints the number of significant movies.
    """
    import MovieLensData as MD
    from scipy.stats import ttest_ind, mannwhitneyu
    gender_dict = MD.gender_user_dictionary_1m()
    user_item = MD.load_user_item_matrix_1m()
    # Parse "id::title::genre" lines into id -> "title::genre".
    movies = {}
    with open("ml-1m/movies.dat", 'r') as f:
        for line in f.readlines():
            movie_key, name, genre = line.replace("\n", "").split("::")
            movies[int(movie_key)] = name + "::" + genre
    counter = 0
    print(len(user_item[0]))
    for movie_id in range(len(user_item[0])):
        ratings = user_item[:, movie_id]
        male_ratings = []
        female_ratings = []
        for user_id, rating in enumerate(ratings):
            if rating > 0:
                if gender_dict[user_id] == 'M':
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)
        try:
            _, p_value = mannwhitneyu(male_ratings, female_ratings)
            # Bonferroni correction over all movies.
            if p_value < 0.05 / len(user_item[0]):
                counter += 1
                if np.average(male_ratings) > np.average(female_ratings):
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::M")
                if np.average(male_ratings) < np.average(female_ratings):
                    print(str(movie_id + 1) + "::" + movies[movie_id + 1] + "::F")
        except Exception:  # BUG FIX: was a bare `except:` (also caught KeyboardInterrupt/SystemExit)
            # mannwhitneyu raises e.g. when one group is empty or all values tie.
            print("Testing failed for", movie_id)
    print(str(1 + 1) + "::" + movies[1])
    print(counter)
def one_hundert_k(classifier): X = MD.load_user_item_matrix_100k() # max_user=max_user, max_item=max_item) #X = MD.load_user_genre_matrix_100k() T = MD.load_gender_vector_100k() # max_user=max_user) X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))] X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):] # print(X) print(X_train.shape) # X = Utils.remove_significant_features(X, T) #X_train = Utils.random_forest_selection(X_train, T_train) # X = feature_selection(X, T, Utils.select_male_female_different) print(X_train.shape) # X = Utils.normalize(X) # X = Utils.standardize(X) # X = chi2_selection(X, T) classifier(X_train, T_train)
def real_vs_fake(): X, T = RFData.load_real_fake_data_ML_1m(file_index=49) #X, T = RFData.load_real_fake_data_ML_100k() #print(type(Y[0])) # Classifiers.log_reg(X, Y) X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))] X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):] Classifiers.log_reg(X_train, T_train) from sklearn.model_selection import StratifiedKFold from sklearn.linear_model import LogisticRegression cv = StratifiedKFold(n_splits=10) coefs = [] avg_coefs = np.zeros(shape=(len(X_train[1]),)) random_state = np.random.RandomState(0) for train, test in cv.split(X_train, T_train): x, t = X_train[train], T_train[train] model = LogisticRegression(penalty='l2', random_state=random_state) model.fit(x, t) # rank the coefs: ranks = ss.rankdata(model.coef_[0]) coefs.append(ranks) # print(len(model.coef_[0]),len(X_train[0])) avg_coefs += model.coef_[0] coefs = np.average(coefs, axis=0) coefs = [[coefs[i], i, avg_coefs[i]] for i in range(len(coefs))] coefs = np.asarray(list(sorted(coefs))) values = coefs[:, 2] index_zero = np.where(values == np.min(np.abs(values))) top_male = index_zero[0][0] top_female = index_zero[0][-1] L_m = coefs[:top_male, 1] R_m = 3952 - coefs[:top_male, 0] C_m = np.abs(coefs[:top_male, 2]) L_f = coefs[coefs.shape[0] - top_female:, 1] L_f = list(reversed(L_f)) R_f = coefs[coefs.shape[0] - top_female:, 0] R_f = list(reversed(R_f)) C_f = coefs[coefs.shape[0] - top_female:, 2] C_f = list(reversed(np.abs(C_f))) id_index, index_id = MD.load_movie_id_index_dict() movies = [] with open("ml-1m/movies.dat", 'r') as f: for line in f.readlines(): movies.append(line.replace("\n", "")) for index, val in enumerate(L_m[0:min(10,len(L_m))]): print(index, movies[id_index[int(val)+1]], C_m[index]) for index, val in enumerate(L_f[0:min(10,len(L_f))]): print(index, movies[id_index[int(val)+1]], C_f[index])