Example #1
0
def flixster_obfuscated(classifier):
    """Compare gender inference on original and obfuscated Flixster data.

    Loads the original Flixster subset and its masked counterpart
    (``file_index=15``), prints a few diagnostics (matrix shapes, the first
    row of each matrix, the shape of the 80 % training slice) and passes both
    matrices plus the labels to ``Utils.ROC_cv_obf`` together with an
    L2-regularised logistic-regression model.

    Args:
        classifier: Not used by this function; kept so that existing callers
            continue to work.

    Returns:
        None. All results are printed or produced by ``Utils.ROC_cv_obf``.
    """
    import FlixsterData as FD
    X1, T, _ = FD.load_flixster_data_subset()
    X2, _, _ = FD.load_flixster_data_subset_masked(file_index=15)
    print(X1.shape, X2.shape)

    # Only the shape of the training slice is reported; the evaluation below
    # runs on the complete matrices, so no test split is materialised here.
    X_train = X1[:int(0.8 * len(X1))]
    print(list(X1[0, :]))
    print(list(X2[0, :]))
    print("before", X_train.shape)
    print(X_train.shape)

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2',
                               random_state=np.random.RandomState(0))
    Utils.ROC_cv_obf(X1, X2, T, model)
Example #2
0
def flixster(classifier):
    """Fit and plot a gender classifier on the 1000-user Flixster subset.

    The data is split 80/20 in file order. ``classifier`` is invoked on the
    training part first; afterwards an L2 logistic regression is fitted on the
    same training part and its ROC curve on the held-out part is drawn via
    ``Utils.ROC_plot``.

    Args:
        classifier: Callable taking ``(X_train, T_train)``.
    """
    import FlixsterData as FD
    X, T = FD.load_flixster_data_subset(file="Flixster/subset_1000.txt")

    # One cut point shared by features and labels.
    cut = int(0.8 * len(X))
    X_train, X_test = X[:cut], X[cut:]
    T_train, T_test = T[:cut], T[cut:]

    print("before", X_train.shape)
    print(X_train.shape)

    classifier(X_train, T_train)

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2',
                               random_state=np.random.RandomState(0))
    model.fit(X_train, T_train)
    Utils.ROC_plot(X_test, T_test, model)
Example #3
0
def stats_of_flixster():
    """Print summary statistics about the Flixster data set.

    Reports the number of valid user profiles and movies, then scans
    ``Flixster/ratings.txt`` (UTF-16, tab separated, first line skipped) to
    count all interactions, the mean and variance of the ratings given by
    known users and the number of known users with at least one rating.
    Finally the number of positive entries and the label distribution of the
    matrix returned by ``FD.load_flixster_data()`` are printed.
    """
    import collections
    import FlixsterData as FD

    id2index, index2id = FD.load_user_id_index_dict()
    user_gender = FD.load_user_gender()
    # Gender tally; currently not printed.
    gender_tally = collections.Counter(
        [user_gender[key] for key in user_gender])

    print("valid user profiles:", len(id2index))
    movies = FD.load_movies()
    print("number of movies", len(movies.keys()))

    total_interactions = 0
    # One slot per user index; the size is hard-coded.
    per_user_counts = np.zeros(shape=(935267,))
    known_ratings = []
    with open("Flixster/ratings.txt", 'r', encoding='utf-16') as handle:
        rows = handle.readlines()[1:]
        for row in rows:
            if len(row) >= 2:
                # Four tab-separated fields: user id first, rating third.
                uid, _, score, _ = row.split("\t")
                if uid in id2index:
                    per_user_counts[id2index[uid]] += 1
                    known_ratings.append(float(score))
                total_interactions += 1

    print("number of interactions", total_interactions)
    print("ratings avg", np.average(known_ratings), np.var(known_ratings))
    print(sum(1 for n in per_user_counts if n > 0))

    print("on subset of 400 users:")
    X, T = FD.load_flixster_data()
    print("interactions:",
          sum(1 for user in X for rating in user if rating > 0))
    print(collections.Counter(T))
Example #4
0
def load_real_fake_data_flixster(file_index=-1):
    """Build a shuffled real-vs-obfuscated Flixster data set.

    The first half of the rows of the original (small) subset is labelled 1,
    the second half of the rows of the masked subset selected by
    ``file_index`` is labelled 0. Both parts are stacked and shuffled with
    ``Utils.shuffle_two_arrays``.

    Args:
        file_index: Passed through to ``load_flixster_data_subset_masked``.

    Returns:
        Tuple ``(data, labels)`` as returned by ``shuffle_two_arrays``.
    """
    import FlixsterData as FD
    original, _, valid_movies = FD.load_flixster_data_subset(small=True)
    real = original[0:int(original.shape[0] / 2), :]

    masked = FD.load_flixster_data_subset_masked(file_index=file_index,
                                                 small=True,
                                                 valid_movies=valid_movies)[0]
    fake = masked[int(masked.shape[0] / 2):, :]

    n_real, n_fake = real.shape[0], fake.shape[0]
    data = np.zeros(shape=(n_real + n_fake, real[0].shape[0]))
    labels = np.zeros(shape=(n_real + n_fake, ))

    # Real rows first (label 1), obfuscated rows after them (label stays 0).
    data[:n_real, :] = real
    labels[:n_real] = 1
    data[n_real:, :] = fake

    from Utils import shuffle_two_arrays
    return shuffle_two_arrays(data, labels)
Example #5
0
def blurMoreAgain():
    """Run a second obfuscation pass over already-masked MovieLens-1M ratings.

    Steps:
      1. Load the masked user-item matrix (``file_index=71``) and the
         real/fake classification set (``file_index=17``).
      2. Fit a logistic regression on each of 10 stratified folds of the
         real/fake set, rank the item coefficients per fold and sort the
         items by mean rank. ``L_m`` holds the items with the lowest ranks
         (most negative coefficients), ``L_f`` those with the highest ranks,
         most positive first.
      3. For every user add ratings (item average as value) for items taken
         from ``L_f`` until ``p`` times the user's own rating count is
         reached, skipping items whose rating count already exceeds
         ``notice_factor`` times its initial count.
      4. For every user delete existing ratings on items taken from ``L_m``,
         with the same per-user budget.
      5. Write the result to
         ``ml-1m/blurme_x2_obfuscated_<mode>_<p>_<notice_factor>.dat``.

    Returns:
        numpy.ndarray: the obfuscated copy of the masked matrix.

    Raises:
        NotImplementedError: if the internal ``dataset`` switch is not 'ML'
            (no masked matrix is available for the other data sets).
        ValueError: if the internal ``sample_mode`` is not 'greedy' or
            'random'.
    """
    import RealFakeData as RF
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    top = -1
    sample_mode = ['random', 'sampled', 'greedy'][2]
    notice_factor = 2
    p = 0.05
    # 23 for blurmore 10% greedy; 17 for BlurMe 10% greedy
    real_fake_file_index = 17
    dataset = ['ML', 'Fx', 'Li'][0]
    if dataset != 'ML':
        raise NotImplementedError(
            "blurMoreAgain only supports the 'ML' data set, got %r" % dataset)
    if sample_mode not in ('greedy', 'random'):
        raise ValueError("unsupported sample_mode %r" % sample_mode)

    BlurMoreData = MD.load_user_item_matrix_1m_masked(file_index=71)
    X, T = RF.load_real_fake_data_ML_1m(file_index=real_fake_file_index)

    # Per-item mean of the positive ratings and number of positive ratings.
    n_items = BlurMoreData.shape[1]
    avg_ratings = np.zeros(shape=n_items)
    initial_count = np.zeros(shape=n_items)
    for item_id in range(n_items):
        column = BlurMoreData[:, item_id]
        rated = column[column > 0]
        if rated.size:
            avg_ratings[item_id] = np.average(rated)
        initial_count[item_id] = rated.size
    max_count = initial_count * notice_factor

    # 1: rank the items by their logistic-regression coefficients.
    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    coef_sum = np.zeros(shape=(X.shape[1],))
    random_state = np.random.RandomState(0)
    for train, _ in cv.split(X, T):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X[train], T[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        coef_sum += model.coef_[0]

    mean_rank = np.average(fold_ranks, axis=0)
    # Rows are [mean rank, 1-based item id, summed coefficient], ascending.
    coefs = np.asarray(sorted(
        [mean_rank[i], i + 1, coef_sum[i]] for i in range(len(mean_rank))))

    n_rows = coefs.shape[0]
    if top == -1:
        # Cut both lists at the block of (near-)zero coefficients. Comparing
        # magnitudes keeps this working when the smallest-magnitude
        # coefficient happens to be negative.
        magnitude = np.abs(coefs[:, 2])
        neutral = np.where(magnitude == np.min(magnitude))[0]
        n_low, n_high = neutral[0], neutral[-1]
    else:
        n_low = n_high = top
    L_m = coefs[:n_low, 1]
    C_m = np.abs(coefs[:n_low, 2])
    # NOTE(review): this keeps the last ``n_high`` rows rather than the rows
    # after the neutral block -- kept as in the original; confirm intent.
    L_f = list(reversed(coefs[n_rows - n_high:, 1]))
    C_f = list(reversed(np.abs(coefs[n_rows - n_high:, 2])))

    id_index, index_id = MD.load_movie_id_index_dict()
    with open("ml-1m/movies.dat", 'r') as f:
        movies = [line.replace("\n", "") for line in f.readlines()]

    for index, val in enumerate(L_m[0:10]):
        print(index, movies[id_index[int(val)]], C_m[index])
    for index, val in enumerate(L_f[0:10]):
        print(index, movies[id_index[int(val)]], C_f[index])

    def pick(candidates, position):
        """Return the candidate item id for the current sampling mode."""
        if sample_mode == 'greedy':
            return candidates[position]
        return candidates[np.random.randint(0, len(candidates))]

    # 2: add ratings for items from L_f.
    X_obf = np.copy(BlurMoreData)
    # Running number of positive ratings per item; kept in sync with X_obf so
    # that the column does not have to be recounted for every candidate.
    rating_counts = np.count_nonzero(X_obf > 0, axis=0)
    for index, user in enumerate(BlurMoreData):
        print(index)
        k = np.count_nonzero(user > 0) * p
        added = 0
        safety_counter = 0
        greedy_index = 0
        while (added < k and safety_counter < 1000
               and greedy_index < len(L_f)):
            column = int(pick(L_f, greedy_index)) - 1
            greedy_index += 1
            if rating_counts[column] > max_count[column]:
                continue
            if X_obf[index, column] == 0:
                X_obf[index, column] = avg_ratings[column]
                if X_obf[index, column] > 0:
                    rating_counts[column] += 1
                added += 1
            safety_counter += 1

    # 3: remove ratings on items from L_m.
    for index, user in enumerate(BlurMoreData):
        print(index)
        k = np.count_nonzero(user > 0) * p
        removed = 0
        safety_counter = 0
        greedy_index = 0
        while (removed < k and safety_counter < 1000
               and greedy_index < len(L_m)):
            column = int(pick(L_m, greedy_index)) - 1
            greedy_index += 1
            if X_obf[index, column] != 0:
                X_obf[index, column] = 0
                removed += 1
            safety_counter += 1

    # 4: write the obfuscated ratings in MovieLens "::" format.
    output_path = ("ml-1m/" + "blurme_x2_obfuscated_" + sample_mode + "_"
                   + str(p) + "_" + str(notice_factor) + ".dat")
    with open(output_path, 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie in np.flatnonzero(user > 0):
                f.write(
                    str(index_user + 1) + "::" + str(int(index_movie) + 1)
                    + "::" + str(int(np.round(user[index_movie])))
                    + "::000000000\n")

    return X_obf
Example #6
0
def blurMe_1m():
    """Obfuscate a rating matrix with the BlurMe strategy and write it to disk.

    Steps:
      1. Load the rating matrix ``X`` and binary labels ``T`` of the data set
         selected by the internal ``dataset`` switch.
      2. Fit a logistic regression on each of 10 stratified folds of the
         first 80 % of the users, rank the item coefficients per fold and
         sort the items by mean rank. ``L_m`` holds the items with the lowest
         ranks (most negative coefficients), ``L_f`` those with the highest
         ranks, most positive first.
      3. For every user add ratings on items the user has not rated until
         ``p`` times the user's own rating count is reached (at most 100
         attempts). Users labelled 1 draw from ``L_m``, users labelled 0 from
         ``L_f``. The value written is 5 (``'highest'``) or the user's own
         mean rating (``'avg'``).
      4. Write the result in ``user::item::rating::000000000`` format.

    Returns:
        numpy.ndarray: the obfuscated copy of ``X``.

    Raises:
        NotImplementedError: if the internal ``rating_mode`` is ``'pred'``,
            which has no implementation.
    """
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    sample_mode = ['random', 'sampled', 'greedy'][2]
    rating_mode = ['highest', 'avg', 'pred'][1]
    top = -1
    p = 0.01
    dataset = ['ML', 'Fx', 'Li'][0]
    if rating_mode not in ('highest', 'avg'):
        raise NotImplementedError(
            "rating_mode %r is not implemented" % rating_mode)

    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()
        T = MD.load_gender_vector_1m()
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()

    # Mean of each user's own positive ratings (0 for users without any).
    avg_ratings = np.zeros(shape=X.shape[0])
    for index, user in enumerate(X):
        rated = user[user > 0]
        if rated.size:
            avg_ratings[index] = np.average(rated)

    # 1: rank the items by their logistic-regression coefficients, using the
    # first 80 % of the users only.
    n_train = int(0.8 * len(X))
    X_train, T_train = X[0:n_train], T[0:n_train]
    print("lists")

    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    coef_sum = np.zeros(shape=(X_train.shape[1],))
    random_state = np.random.RandomState(0)
    for train, _ in cv.split(X_train, T_train):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X_train[train], T_train[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        coef_sum += model.coef_[0]

    mean_rank = np.average(fold_ranks, axis=0)
    # Rows are [mean rank, 1-based item id, summed coefficient], ascending.
    coefs = np.asarray(sorted(
        [mean_rank[i], i + 1, coef_sum[i]] for i in range(len(mean_rank))))

    n_rows = coefs.shape[0]
    if top == -1:
        # Cut both lists at the block of (near-)zero coefficients. Comparing
        # magnitudes keeps this working when the smallest-magnitude
        # coefficient happens to be negative.
        magnitude = np.abs(coefs[:, 2])
        neutral = np.where(magnitude == np.min(magnitude))[0]
        n_low, n_high = neutral[0], neutral[-1]
    else:
        n_low = n_high = top
    L_m = coefs[:n_low, 1]
    C_m = np.abs(coefs[:n_low, 2])
    # NOTE(review): this keeps the last ``n_high`` rows rather than the rows
    # after the neutral block -- kept as in the original; confirm intent.
    L_f = list(reversed(coefs[n_rows - n_high:, 1]))
    C_f = list(reversed(np.abs(coefs[n_rows - n_high:, 2])))

    # Sampling weights proportional to the coefficient magnitude; only needed
    # (and only computed) for the 'sampled' mode.
    if sample_mode == 'sampled':
        prob_m = np.asarray(C_m) / np.sum(C_m)
        prob_f = np.asarray(C_f) / np.sum(C_f)
    else:
        prob_m = prob_f = None

    # 2: add ratings.
    print("obfuscation")
    X_obf = np.copy(X)
    for index, user in enumerate(X):
        print(index)
        k = np.count_nonzero(user > 0) * p
        if T[index] == 1:
            candidates, probabilities = L_m, prob_m
        elif T[index] == 0:
            candidates, probabilities = L_f, prob_f
        else:
            continue
        if len(candidates) == 0:
            # Nothing to choose from; avoids indexing an empty list.
            continue

        new_rating = 5 if rating_mode == 'highest' else avg_ratings[index]
        added = 0
        safety_counter = 0
        greedy_index = 0
        while added < k and safety_counter < 100:
            if sample_mode == 'random':
                movie_id = candidates[np.random.randint(0, len(candidates))]
            elif sample_mode == 'sampled':
                movie_id = candidates[
                    np.random.choice(len(candidates), p=probabilities)]
            else:
                movie_id = candidates[greedy_index]
                greedy_index += 1
                if greedy_index >= len(candidates):
                    # List exhausted: finish this item, then stop.
                    safety_counter = 100
            column = int(movie_id) - 1
            if X_obf[index, column] == 0:
                X_obf[index, column] = new_rating
                added += 1
            safety_counter += 1

    # 3: output the data in a file.
    file_name = (str(p) + "_" + sample_mode + "_" + rating_mode + "_top"
                 + str(top) + ".dat")
    user_index2id = movie_index2id = None
    if dataset == 'ML':
        output_path = "ml-1m/" + "blurme_obfuscated_" + file_name
    elif dataset == 'Fx':
        output_path = "Flixster/" + "FX_blurme_obfuscated_" + file_name
        # Flixster rows/columns are written with their original ids.
        user_id2index, user_index2id = FD.load_user_id_index_dict()
        movie_id2index, movie_index2id = FD.load_movie_id_index_dict()
    else:
        output_path = "libimseti/LST_blurme_obfuscated_" + file_name

    with open(output_path, 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie in np.flatnonzero(user > 0):
                index_movie = int(index_movie)
                if user_index2id is None:
                    user_id, movie_id = index_user + 1, index_movie + 1
                else:
                    user_id = user_index2id[index_user]
                    movie_id = movie_index2id[index_movie]
                f.write(str(user_id) + "::" + str(movie_id) + "::" + str(
                    int(np.round(user[index_movie]))) + "::000000000\n")

    return X_obf
Example #7
0
def blurMePP():
    """Obfuscate a rating matrix with the BlurMe++ strategy and write it out.

    Steps:
      1. Load the rating matrix ``X`` and binary labels ``T`` of the data set
         selected by the internal ``dataset`` switch.
      2. Fit a logistic regression on each of 10 stratified folds, rank the
         item coefficients per fold and sort the items by mean rank. ``L_m``
         holds the items with the lowest ranks (most negative coefficients),
         ``L_f`` those with the highest ranks, most positive first.
      3. For every user add ratings (item average as value) on unrated items
         until ``p`` times the user's own rating count is reached. Users
         labelled 1 draw from ``L_m``, users labelled 0 from ``L_f``. Items
         whose rating count already exceeds ``notice_factor`` times its
         initial count are skipped.
      4. Remove ``total_added / n`` randomly chosen original ratings from
         each of the ``n`` users that have more than 200 ratings.
      5. Write the result in ``user::item::rating::000000000`` format.

    Returns:
        numpy.ndarray: the obfuscated copy of ``X``.

    Raises:
        ValueError: if the internal ``sample_mode`` is not 'greedy' or
            'random'.
    """
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    top = -1
    sample_mode = ['random', 'sampled', 'greedy'][2]
    notice_factor = 2
    p = 0.1
    dataset = ['ML', 'Fx', 'Li'][2]
    if sample_mode not in ('greedy', 'random'):
        raise ValueError("unsupported sample_mode %r" % sample_mode)

    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()
        T = MD.load_gender_vector_1m()
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()

    # Per-item mean of the positive ratings and number of positive ratings.
    n_items = X.shape[1]
    avg_ratings = np.zeros(shape=n_items)
    initial_count = np.zeros(shape=n_items)
    for item_id in range(n_items):
        column = X[:, item_id]
        rated = column[column > 0]
        if rated.size:
            avg_ratings[item_id] = np.average(rated)
        initial_count[item_id] = rated.size
    max_count = initial_count * notice_factor

    # 1: rank the items by their logistic-regression coefficients.
    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    coef_sum = np.zeros(shape=(n_items,))
    random_state = np.random.RandomState(0)
    for train, _ in cv.split(X, T):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X[train], T[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        coef_sum += model.coef_[0]

    mean_rank = np.average(fold_ranks, axis=0)
    # Rows are [mean rank, 1-based item id, summed coefficient], ascending.
    coefs = np.asarray(sorted(
        [mean_rank[i], i + 1, coef_sum[i]] for i in range(len(mean_rank))))

    n_rows = coefs.shape[0]
    if top == -1:
        # Cut both lists at the block of (near-)zero coefficients. Comparing
        # magnitudes keeps this working when the smallest-magnitude
        # coefficient happens to be negative.
        magnitude = np.abs(coefs[:, 2])
        neutral = np.where(magnitude == np.min(magnitude))[0]
        n_low, n_high = neutral[0], neutral[-1]
    else:
        n_low = n_high = top
    L_m = coefs[:n_low, 1]
    # NOTE(review): this keeps the last ``n_high`` rows rather than the rows
    # after the neutral block -- kept as in the original; confirm intent.
    L_f = list(reversed(coefs[n_rows - n_high:, 1]))

    # 2: add ratings.
    X_obf = np.copy(X)
    # Running number of positive ratings per item; kept in sync with X_obf so
    # that the column does not have to be recounted for every candidate.
    rating_counts = np.count_nonzero(X_obf > 0, axis=0)
    total_added = 0
    for index, user in enumerate(X):
        print(index)
        k = np.count_nonzero(user > 0) * p
        if T[index] == 1:
            candidates = L_m
        elif T[index] == 0:
            candidates = L_f
        else:
            continue

        added = 0
        safety_counter = 0
        greedy_index = 0
        while (added < k and safety_counter < 1000
               and greedy_index < len(candidates)):
            if sample_mode == 'greedy':
                movie_id = candidates[greedy_index]
            else:
                movie_id = candidates[np.random.randint(0, len(candidates))]
            greedy_index += 1
            column = int(movie_id) - 1
            if rating_counts[column] > max_count[column]:
                continue
            if X_obf[index, column] == 0:
                X_obf[index, column] = avg_ratings[column]
                if X_obf[index, column] > 0:
                    rating_counts[column] += 1
                added += 1
            safety_counter += 1
        total_added += added

    # 3: remove ratings from users that have more than 200 ratings, equally.
    heavy_users = [user_index for user_index, user in enumerate(X)
                   if np.count_nonzero(user > 0) > 200]
    print(len(heavy_users))
    if heavy_users:  # without heavy users there is nobody to remove from
        nr_remove = int(total_added / len(heavy_users))
        for user_index in heavy_users:
            rated_items = np.argwhere(X[user_index] > 0)[:, 0]
            # Never ask for more items than the user actually rated.
            n_drop = min(nr_remove, len(rated_items))
            to_be_removed = np.random.choice(rated_items, size=(n_drop,),
                                             replace=False)
            X_obf[user_index, to_be_removed] = 0

    # 4: output the data in a file.
    file_name = (sample_mode + "_" + str(p) + "_" + str(notice_factor)
                 + ".dat")
    user_index2id = movie_index2id = None
    if dataset == 'ML':
        output_path = "ml-1m/" + "blurmepp_obfuscated_" + file_name
    elif dataset == 'Fx':
        output_path = "Flixster/" + "FX_blurmepp_obfuscated_" + file_name
        # Flixster rows/columns are written with their original ids.
        user_id2index, user_index2id = FD.load_user_id_index_dict()
        movie_id2index, movie_index2id = FD.load_movie_id_index_dict()
    else:
        output_path = "libimseti/LST_blurmepp_obfuscated_" + file_name

    with open(output_path, 'w') as f:
        for index_user, user in enumerate(X_obf):
            for index_movie in np.flatnonzero(user > 0):
                index_movie = int(index_movie)
                if user_index2id is None:
                    user_id, movie_id = index_user + 1, index_movie + 1
                else:
                    user_id = user_index2id[index_user]
                    movie_id = movie_index2id[index_movie]
                f.write(str(user_id) + "::" + str(movie_id) + "::" + str(
                    int(np.round(user[index_movie]))) + "::000000000\n")

    return X_obf
Example #8
0
def blurMeBetter(dataset='ML', sample_mode='greedy', p=0.05, notice_factor=2,
                 certainty_threshold=0.8, top=-1):
    """Obfuscate a user-item rating matrix and write the result to a ``.dat`` file.

    The procedure is:

    1. Load the rating matrix ``X`` and the label vector ``T`` for ``dataset``.
    2. Fit an L2 logistic regression in 10 stratified folds. Per fold the
       coefficient ranks are collected, and every held-out user receives a
       certainty score ``2 * (max class probability - 0.5)``, which is reset
       to 0 when the user was misclassified.
    3. Order the items by their mean coefficient rank and derive two candidate
       lists: ``L_m`` from the low end and ``L_f`` from the high end.
    4. For every user whose certainty reaches ``certainty_threshold``, add up
       to ``p * (number of the user's ratings)`` new ratings (the item's
       average rating) taken from ``L_m`` when ``T == 1`` and from ``L_f``
       when ``T == 0``. An item is skipped once its number of ratings exceeds
       ``notice_factor`` times its original count.
    5. Remove an equal number of randomly chosen ratings from every user that
       has more than 200 ratings in the original matrix, so that roughly as
       many ratings are removed as were added.
    6. Write all positive ratings as ``user::item::rating::000000000`` lines.

    Relies on the module-level names ``np``, ``ss`` (presumably
    ``scipy.stats`` -- confirm against the file's imports) and ``MD``.

    Args:
        dataset: ``'ML'`` or ``'Fx'``; any other value selects the libimseti
            loader and output path.
        sample_mode: ``'greedy'`` walks the candidate list in order,
            ``'random'`` draws candidates uniformly with ``np.random``.
        p: fraction of a user's rating count that may be added.
        notice_factor: multiplier on an item's original rating count that
            bounds how many ratings the item may end up with.
        certainty_threshold: users with a lower certainty score are skipped.
        top: number of items taken from each end of the ranking; ``-1``
            derives the sizes from the position of the smallest-magnitude
            coefficients instead.

    Returns:
        The obfuscated copy of the rating matrix.

    Raises:
        ValueError: if ``sample_mode`` is neither ``'greedy'`` nor ``'random'``.
    """
    if sample_mode not in ('greedy', 'random'):
        raise ValueError(
            "unsupported sample_mode " + repr(sample_mode) + "; expected 'greedy' or 'random'")

    if dataset == 'ML':
        X = MD.load_user_item_matrix_1m()  # max_user=max_user, max_item=max_item)
        T = MD.load_gender_vector_1m()  # max_user=max_user)
    elif dataset == 'Fx':
        import FlixsterData as FD
        X, T, _ = FD.load_flixster_data_subset()
    else:
        import LibimSeTiData as LD
        X, T, _ = LD.load_libimseti_data_subset()

    nr_items = X.shape[1]

    # Per-item average rating and rating count over the observed (> 0) entries.
    avg_ratings = np.zeros(shape=nr_items)
    initial_count = np.zeros(shape=nr_items)
    for item_index in range(nr_items):
        column = X[:, item_index]
        observed = column[column > 0]
        initial_count[item_index] = observed.size
        if observed.size > 0:
            avg_ratings[item_index] = np.average(observed)
    # An item may not receive more ratings than this.
    max_count = initial_count * notice_factor

    # 1: rank the items by their logistic-regression coefficients and score
    #    how certain the classifier is about every user.
    from sklearn.model_selection import StratifiedKFold
    from sklearn.linear_model import LogisticRegression

    cv = StratifiedKFold(n_splits=10)
    fold_ranks = []
    coef_sums = np.zeros(shape=(nr_items,))
    certainty = np.zeros(shape=(len(X),))
    random_state = np.random.RandomState(0)
    for train, test in cv.split(X, T):
        model = LogisticRegression(penalty='l2', random_state=random_state)
        model.fit(X[train], T[train])
        fold_ranks.append(ss.rankdata(model.coef_[0]))
        coef_sums += model.coef_[0]

        x_test = X[test]
        # Rescale the winning class probability from [0.5, 1] to [0, 1].
        confidence = (np.max(model.predict_proba(x_test), axis=1) - 0.5) * 2
        # A misclassified user counts as "not certain at all".
        confidence[model.predict(x_test) != T[test]] = 0
        certainty[test] = confidence

    # Item indices ordered by mean rank; the stable sort breaks ties by item
    # index, which matches sorting (rank, item id) pairs.
    mean_ranks = np.average(fold_ranks, axis=0)
    order = np.argsort(mean_ranks, kind='stable')
    nr_ranked = order.shape[0]

    if top == -1:
        # Locate the block of smallest-magnitude coefficients. Comparing the
        # magnitudes (not the signed values) guarantees at least one match
        # even when the smallest-magnitude coefficient is negative.
        magnitudes = np.abs(coef_sums[order])
        neutral = np.flatnonzero(magnitudes == magnitudes.min())
        top_male = neutral[0]
        top_female = neutral[-1]
    else:
        top_male = top_female = top

    L_m = order[:top_male]
    # NOTE(review): this takes the last `top_female` entries of the ranking,
    # not everything after position `top_female` -- kept as in the original
    # implementation; confirm that this is intended.
    L_f = order[nr_ranked - top_female:][::-1]

    # 2: add ratings for the users the classifier is certain about.
    X_obf = np.copy(X)
    total_added = 0
    nr_skipped_users = 0
    for index, user in enumerate(X):
        if certainty[index] < certainty_threshold:
            nr_skipped_users += 1
            print(index, nr_skipped_users)
            continue

        if T[index] == 1:
            candidates = L_m
        elif T[index] == 0:
            candidates = L_f
        else:
            # Unknown label: nothing is added for this user.
            continue

        k = np.count_nonzero(user > 0) * p
        added = 0
        attempts = 0
        position = 0
        # At most len(candidates) draws and at most 1000 counted attempts.
        while added < k and attempts < 1000 and position < len(candidates):
            if sample_mode == 'greedy':
                item = candidates[position]
            else:
                item = candidates[np.random.randint(0, len(candidates))]
            position += 1
            # Skip items that already exceed their allowed rating count;
            # such skips do not count as attempts.
            if np.count_nonzero(X_obf[:, item] > 0) > max_count[item]:
                continue
            if X_obf[index, item] == 0:
                X_obf[index, item] = avg_ratings[item]
                added += 1
            attempts += 1
        total_added += added
    print("nr of skipped users:", nr_skipped_users)

    # 3: remove ratings equally from the users with more than 200 ratings in
    #    the original matrix.
    ratings_per_user = np.count_nonzero(X > 0, axis=1)
    heavy_users = np.flatnonzero(ratings_per_user > 200)
    if heavy_users.size > 0:  # without such users there is nothing to remove from
        nr_remove = int(total_added / heavy_users.size)
        for user_index in heavy_users:
            rated_items = np.argwhere(X[user_index] > 0)[:, 0]
            # Never ask for more distinct items than the user has rated.
            to_be_removed = np.random.choice(
                rated_items, size=(min(nr_remove, rated_items.size),), replace=False)
            X_obf[user_index, to_be_removed] = 0

    # 4: output the data in a file.
    suffix = sample_mode + "_" + str(p) + "_" + str(
        notice_factor) + "_c" + str(certainty_threshold) + ".dat"
    user_index2id = None
    movie_index2id = None
    if dataset == 'ML':
        output_path = "ml-1m/" + "blurmebetter_obfuscated_" + suffix
    elif dataset == 'Fx':
        import FlixsterData as FD
        user_id2index, user_index2id = FD.load_user_id_index_dict()
        movie_id2index, movie_index2id = FD.load_movie_id_index_dict()
        output_path = "Flixster/" + "FX_blurmebetter_obfuscated_" + suffix
    else:
        output_path = "libimseti/LST_blurmebetter_obfuscated_" + suffix
    _write_blurmebetter_ratings(output_path, X_obf, user_index2id, movie_index2id)

    return X_obf


def _write_blurmebetter_ratings(path, ratings, user_index2id=None, movie_index2id=None):
    """Write every positive entry of ``ratings`` as a ``user::item::rating::000000000`` line.

    Entries are written in row-major order. Without an index-to-id mapping the
    1-based row / column position is used as the id; the rating is rounded to
    the nearest integer.
    """
    with open(path, 'w') as f:
        for user_index, movie_index in np.argwhere(ratings > 0):
            user_index, movie_index = int(user_index), int(movie_index)
            user_id = user_index + 1 if user_index2id is None else user_index2id[user_index]
            movie_id = movie_index + 1 if movie_index2id is None else movie_index2id[movie_index]
            f.write(str(user_id) + "::" + str(movie_id) + "::" + str(
                int(np.round(ratings[user_index, movie_index]))) + "::000000000\n")