Beispiel #1
0
def one_hundert_k(classifier):
    """Run *classifier* on the ML-100k user-item matrix with age labels."""
    ratings = MD.load_user_item_matrix_100k()  # max_user=max_user, max_item=max_item)
    #ratings = normalize(ratings)
    ages = MD.load_age_vector_100k()  # max_user=max_user)
    #ratings = chi2_selection(ratings, ages)

    classifier(ratings, ages)
Beispiel #2
0
def avg_rating_diff():
    """Plot the 20 most male- and female-favoured movies of ML-100k.

    For every movie, the average male rating and the average female rating
    (ignoring missing ratings, stored as 0) are compared; the movies with
    the largest positive (male-preferred) and most negative
    (female-preferred) differences are shown in a horizontal bar chart.
    """
    X = MD.load_user_item_matrix_100k()
    T = MD.load_gender_vector_100k()
    name_dict = MD.load_movie_id_dictionary_100k()
    male_indices = np.argwhere(T == 0)[:, 0]
    female_indices = np.argwhere(T == 1)[:, 0]
    differences = np.zeros(shape=X.shape[1],)
    for movie_index, movie in enumerate(np.transpose(X)):
        # BUG FIX: the original zipped the male and female rating vectors,
        # which truncates to the shorter group and silently discards every
        # rating past that length. Filter each group independently instead.
        male_ratings = movie[male_indices]
        female_ratings = movie[female_indices]
        rated_m = male_ratings[male_ratings > 0]
        rated_f = female_ratings[female_ratings > 0]
        # Only movies rated by both groups get a difference; others stay 0.
        if len(rated_m) > 0 and len(rated_f) > 0:
            differences[movie_index] = np.average(rated_m) - np.average(rated_f)
    # Pair each difference with its 0-based movie index and sort descending.
    differences = [[differences[index], index] for index in range(differences.shape[0])]
    differences = np.asarray(list(reversed(sorted(differences))))
    print(differences[0:20, 1])
    # Indices come back as floats from the ndarray; cast for the dict lookup.
    names = [name_dict[int(index) + 1] for index in
             np.concatenate((differences[0:20, 1], differences[-20:, 1]))]
    print(names)
    fig, ax = plt.subplots()
    ax.barh(range(40), np.concatenate((differences[0:20, 0], differences[-20:, 0]), axis=0), align='center')
    ax.set_yticks(range(40))
    ax.set_yticklabels(names)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Difference')
    ax.set_title('Rating difference between males and females')

    plt.show()
Beispiel #3
0
def one_hundert_k(classifier):
    """Run *classifier* (multiclass) on ML-100k with occupation labels."""
    user_item = MD.load_user_item_matrix_100k(
    )  # max_user=max_user, max_item=max_item)
    occupations = MD.load_occupation_vector_100k()  # max_user=max_user)
    #user_item = MD.chi2_selection(user_item, occupations)

    classifier(user_item, occupations, multiclass=True)
Beispiel #4
0
def test_avg_rating_gender_per_movie_100k():
    """Find ML-100k movies whose male/female average ratings differ significantly.

    Runs an independent two-sample t-test per movie on the non-zero ratings
    of each gender and prints, with Bonferroni correction, each movie where
    one group rates significantly higher (``::M`` = male-preferred,
    ``::F`` = female-preferred), followed by the count of such movies.
    """
    import MovieLensData as MD
    from scipy.stats import ttest_ind
    gender_vec = MD.load_gender_vector_100k()
    user_item = MD.load_user_item_matrix_100k()

    # Parse "id|title|..." lines of u.item into an id -> title mapping.
    movies = {}
    with open("ml-100k/u.item", 'r') as f:
        for line in f.readlines():
            i1 = line.find("|")
            movie_id_str = line[:i1]  # renamed from `id` (shadowed builtin)
            i2 = line.find("|", i1 + 1)
            name = line[i1 + 1:i2]
            movies[int(movie_id_str)] = name
    counter = 0
    print(len(user_item[0]))
    for movie_id in range(len(user_item[0])):
        ratings = user_item[:, movie_id]
        male_ratings = []
        female_ratings = []
        # Split observed (non-zero) ratings by the rater's gender.
        for user_id, rating in enumerate(ratings):
            if rating > 0:
                if gender_vec[user_id] == 0:
                    male_ratings.append(rating)
                else:
                    female_ratings.append(rating)
        try:
            # An empty sample would make ttest_ind return nan; substitute [0].
            if len(male_ratings) == 0:
                male_ratings = np.array([0])
            if len(female_ratings) == 0:
                female_ratings = np.array([0])
            # Equal means: nothing to report (and avoids degenerate tests).
            if np.average(male_ratings) == np.average(female_ratings):
                continue

            _, p_value = ttest_ind(male_ratings, female_ratings)
            # Bonferroni correction: divide alpha by the number of tests.
            if p_value < (0.05 / len(user_item[0])):
                counter += 1
                if np.average(male_ratings) > np.average(female_ratings):
                    print(
                        str(movie_id + 1) + "::" + movies[movie_id + 1] +
                        "::M")
                if np.average(male_ratings) < np.average(female_ratings):
                    print(
                        str(movie_id + 1) + "::" + movies[movie_id + 1] +
                        "::F")
        # BUG FIX: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; catch only ordinary errors.
        except Exception:
            print(male_ratings, female_ratings)
            print("Testing failed for", movie_id)
            continue

    print("counter", counter)
Beispiel #5
0
def one_hundert_k_obfuscated(classifier):
    """Train a gender classifier on obfuscated ML-100k data, test on originals.

    NOTE(review): the *classifier* argument is unused — a logistic
    regression is trained directly, exactly as in the original code.
    """
    X_plain = MD.load_user_item_matrix_100k()
    # Loaded but unused in the original as well; kept for identical side effects.
    X_masked = MD.load_user_item_matrix_100k_masked(file_index=1)  # max_user=max_user, max_item=max_item)
    X_blurred = MD.load_user_item_matrix_100k_masked(file_index=-1)

    T = MD.load_gender_vector_100k()  # max_user=max_user)
    # 80/20 split: train on the obfuscated ratings, test on the plain ones.
    train_cut = int(0.8 * len(X_blurred))
    X_train, T_train = X_blurred[0:train_cut], T[0:train_cut]
    test_cut = int(0.8 * len(X_plain))
    X_test, T_test = X_plain[test_cut:], T[test_cut:]

    from sklearn.linear_model import LogisticRegression
    rng = np.random.RandomState(0)
    model = LogisticRegression(penalty='l2', random_state=rng)
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
Beispiel #6
0
def one_hundert_k(classifier):
    """Fit *classifier* on the first 80% of ML-100k users with gender labels."""
    X = MD.load_user_item_matrix_100k()  # max_user=max_user, max_item=max_item)
    #X = MD.load_user_genre_matrix_100k()
    T = MD.load_gender_vector_100k()  # max_user=max_user)
    cut = int(0.8 * len(X))
    X_train, T_train = X[0:cut], T[0:cut]
    X_test, T_test = X[cut:], T[cut:]

    # print(X)
    print(X_train.shape)
    # X = Utils.remove_significant_features(X, T)
    #X_train = Utils.random_forest_selection(X_train, T_train)
    # X = feature_selection(X, T, Utils.select_male_female_different)
    print(X_train.shape)

    # X = Utils.normalize(X)
    # X = Utils.standardize(X)
    # X = chi2_selection(X, T)

    classifier(X_train, T_train)
Beispiel #7
0
def feature_importance_100k():
    """Average ExtraTrees feature importances over 100 fits, plot and print them."""
    from sklearn.ensemble import ExtraTreesClassifier
    X = MD.load_user_item_matrix_100k()
    T = MD.load_gender_vector_100k()
    n_runs = 100
    importance = np.zeros(shape=(X.shape[1],))
    # Each fit is randomized; averaging over runs stabilizes the scores.
    for _ in range(n_runs):
        forest = ExtraTreesClassifier()
        forest.fit(X, T)
        importance += forest.feature_importances_
    importance /= n_runs
    plt.plot(range(len(importance)), importance)
    plt.xlabel("movie index")
    plt.ylabel("importance")
    plt.show()
    # Print 1-based ids of all movies whose mean importance reaches 0.002.
    counter = 0
    for movie, score in enumerate(importance):
        if score >= 0.002:
            print(movie + 1, end=",")
            counter += 1
    print()
    print(counter)
Beispiel #8
0
def PCA_100k():
    # Project the ML-100k user-item matrix to two PCA components and
    # scatter-plot users by gender (males blue, females red).
    X = MD.load_user_item_matrix_100k()
    T = MD.load_gender_vector_100k()
    males = X[np.argwhere(T==0)[:,0]]
    females = X[np.argwhere(T==1)[:,0]]
    print(females.shape)
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2)
    # First figure: PCA is fitted on the males, then *re-fitted* on the
    # females, so the two groups are drawn in two different projections.
    PC = pca.fit_transform(males)
    for x, y in PC:
        plt.scatter(x, y, c='b')
    PC = pca.fit_transform(females)
    for x, y in PC:
        plt.scatter(x, y, c='r')

    plt.show()

    # NOTE(review): at this point PC holds only the FEMALE projections, yet
    # the loop colours point *index* via T[index], which indexes the full
    # user population — this pairing looks unintended; confirm before use.
    for index, (x,y) in enumerate(PC):
        if T[index] == 0:
            plt.scatter(x, y, c='b')
        else:
            plt.scatter(x, y, c='r')
    plt.show()
    print(PC)
Beispiel #9
0
def blurMe_100k():
    """BlurMe-style gender obfuscation of the ML-100k user-item matrix.

    1. Ranks movies by the coefficients of a logistic-regression gender
       classifier (10-fold CV over the first 80% of users) to obtain the
       10 most male-indicative (L_m) and female-indicative (L_f) movies.
    2. For each user, adds about ``p * #rated`` fake ratings drawn from the
       *opposite* gender's list (selection: random / sampled / greedy;
       value: constant 5 or the movie's average rating).
    3. Writes the obfuscated matrix to a ``user_id,item_id,rating`` CSV
       and returns it.
    """
    sample_mode = list(['random', 'sampled', 'greedy'])[2]
    rating_mode = list(['highest', 'avg', 'pred'])[1]

    # 1: get the set of most correlated movies, L_f and L_m:
    X = MD.load_user_item_matrix_100k()  # max_user=max_user, max_item=max_item)
    # Per-movie average over the observed (non-zero) ratings only; 0 if unrated.
    avg_ratings = np.zeros(shape=X.shape[1])
    for item_id in range(X.shape[1]):
        observed = [r for r in X[:, item_id] if r > 0]
        avg_ratings[item_id] = np.average(observed) if observed else 0

    T = MD.load_gender_vector_100k()  # max_user=max_user)
    X_train, T_train = X[0:int(0.8 * len(X))], T[0:int(0.8 * len(X))]
    X_test, T_test = X[int(0.8 * len(X)):], T[int(0.8 * len(X)):]

    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    coefs = []

    for train, test in cv.split(X_train, T_train):
        x, t = X_train[train], T_train[train]
        random_state = np.random.RandomState(0)
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(x, t)
        coefs.append(model.coef_)

    # Average fold coefficients; pair each with its 1-based movie id and sort.
    coefs = np.average(coefs, axis=0)[0]
    coefs = [[coefs[i], i + 1] for i in range(len(coefs))]
    coefs = np.asarray(list(sorted(coefs)))
    L_m = coefs[:10, 1]                    # most negative coefs: male-indicative
    L_f = coefs[coefs.shape[0] - 10:, 1]
    L_f = list(reversed(L_f))              # most positive first: female-indicative

    # Now, where we have the two lists, we can start obfuscating the data:
    X = MD.load_user_item_matrix_100k()
    X_obf = MD.load_user_item_matrix_100k()
    p = 0.1
    # NOTE(review): these weights are proportional to the movie *ids*, not to
    # the coefficient magnitudes — preserved from the original; verify intent.
    prob_m = [mid / sum(L_m) for mid in L_m]
    prob_f = [fid / sum(L_f) for fid in L_f]
    for index, user in enumerate(X):
        # Number of fake ratings to add: p * (movies this user has rated).
        k = p * int(np.sum(user > 0))
        greedy_index = 0
        # Users labelled 1 get male-indicative movies and vice versa.
        if T[index] == 1:
            candidates, probabilities = L_m, prob_m
        elif T[index] == 0:
            candidates, probabilities = L_f, prob_f
        else:
            continue
        added = 0
        safety_counter = 0
        while added < k and safety_counter < 100:
            # select a movie according to the configured strategy:
            if sample_mode == 'random':
                movie_id = candidates[np.random.randint(0, len(candidates))]
            elif sample_mode == 'sampled':
                movie_id = candidates[np.random.choice(range(len(candidates)), p=probabilities)]
            elif sample_mode == 'greedy':
                movie_id = candidates[greedy_index]
                greedy_index += 1
                if greedy_index >= len(candidates):
                    safety_counter = 100  # exhausted the list: stop after this pass
            # Only add a rating where none exists yet.
            if X_obf[index, int(movie_id) - 1] == 0:
                # BUG FIX: was 'higest', so the 'highest' mode could never fire.
                if rating_mode == 'highest':
                    X_obf[index, int(movie_id) - 1] = 5
                elif rating_mode == 'avg':
                    # BUG FIX: avg_ratings is 0-based but movie_id is 1-based;
                    # the original read the *next* movie's average (and could
                    # index out of bounds on the last movie).
                    X_obf[index, int(movie_id) - 1] = avg_ratings[int(movie_id) - 1]
                added += 1
            safety_counter += 1

    # output the data in a file:
    with open("ml-100k/blurme_obfuscated_" + str(p) + "_" + sample_mode + "_" + rating_mode + ".csv", 'w') as f:
        # BUG FIX: the header lacked a trailing newline, fusing it with the
        # first data row.
        f.write("user_id,item_id,rating\n")
        for index_user, user in enumerate(X_obf):
            for index_movie, rating in enumerate(user):
                if rating > 0:
                    f.write(str(index_user + 1) + "," + str(index_movie + 1) + "," + str(int(rating)) + "\n")
    return X_obf