コード例 #1
0
def Similarity(df_train, df_test, df_test_rec_items, grid_area_dict, pp):
    """Train the check-in models and build similarity-based obfuscations.

    The random-forest and XGBoost models are fitted on ``df_train`` after its
    grid groups are refreshed from ``grid_area_dict``.  A cosine-similarity
    matrix over all but the last four columns of ``df_test`` is handed to
    ``funcs.get_similarity_obf_X`` 50 times to collect obfuscated copies and
    once more to obtain the original records.

    Args:
        df_train: Training frame passed to the two model trainers.
        df_test: Frame to obfuscate; its last four columns are left out of
            the similarity computation.
        df_test_rec_items: Frame whose grid groups are refreshed; it is not
            read again inside this function.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        pp: Forwarded unchanged to ``funcs.get_similarity_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori, model_rf, model_xgb)``; ``X_obf_dict``
        maps run index 0..49 to the obfuscated records of that run.
    """
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    rf_classifier = funcs.train_rf_model_check_in(df_train)
    xgb_classifier = funcs.train_xgb_model_check_in(df_train)
    print("model training over...")

    # The result is not used below; the call is kept because the helper may
    # update the frame in place -- TODO confirm against funcs.
    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    print("start obfuscating...")

    item_values = df_test[df_test.columns[:-4]].values
    similarity = cosine_similarity(item_values)

    X_obf_dict = {}
    for run in range(50):
        obfuscated, _ = funcs.get_similarity_obf_X(similarity, df_test, pp)
        X_obf_dict[run] = obfuscated
    _, X_ori = funcs.get_similarity_obf_X(similarity, df_test, pp)

    print("obfuscating done.")

    # Look up the area of the grid stored second-from-last in each original
    # row, write it third-from-last there and last in every obfuscated row.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    return X_obf_dict, X_ori, rf_classifier, xgb_classifier
コード例 #2
0
def Random(df_train, df_test, grid_area_dict, df_test_rec_items, p_rand, pp):
    """Train the check-in models and build random obfuscations of ``df_test``.

    Args:
        df_train: Training frame; its grid groups are refreshed from
            ``grid_area_dict`` before both models are fitted on it.
        df_test: Frame handed to ``funcs.get_random_obf_X``.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        df_test_rec_items: Frame whose grid groups are refreshed; it is not
            read again inside this function.
        p_rand: Forwarded unchanged to ``funcs.get_random_obf_X``.
        pp: Forwarded unchanged to ``funcs.get_random_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori, model_rf, model_xgb)``; ``X_obf_dict``
        holds 50 obfuscation runs keyed 0..49.
    """
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    rf_classifier = funcs.train_rf_model_check_in(df_train)
    xgb_classifier = funcs.train_xgb_model_check_in(df_train)
    print("model training over...")

    # Result unused below; kept in case the helper mutates its argument.
    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    print("start obfuscating...")

    X_obf_dict = {}
    for run in range(50):
        obfuscated, _ = funcs.get_random_obf_X(df_test, p_rand, pp)
        X_obf_dict[run] = obfuscated
    # One extra call supplies the un-obfuscated records.
    _, X_ori = funcs.get_random_obf_X(df_test, p_rand, pp)

    print("obfuscating done.")

    # Copy each user's area into the original row (third-from-last slot) and
    # into the last slot of that user's row in every obfuscation run.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area
    return X_obf_dict, X_ori, rf_classifier, xgb_classifier
コード例 #3
0
def differential_privacy(df_train, df_test, grid_area_dict, df_test_rec_items,
                         beta):
    """Train the check-in models and build DP obfuscations of ``df_test``.

    A pairwise Euclidean distance matrix is computed over all but the last
    four columns of ``df_test``, each row is scaled by its maximum, and the
    result is passed to ``funcs.get_DP_obf_X`` together with ``beta``.

    Args:
        df_train: Training frame; its grid groups are refreshed from
            ``grid_area_dict`` before both models are fitted on it.
        df_test: Frame to obfuscate.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        df_test_rec_items: Frame whose grid groups are refreshed; it is not
            read again inside this function.
        beta: Forwarded unchanged to ``funcs.get_DP_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori, model_rf, model_xgb)``; ``X_obf_dict``
        holds 50 obfuscation runs keyed 0..49.
    """
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    rf_classifier = funcs.train_rf_model_check_in(df_train)
    xgb_classifier = funcs.train_xgb_model_check_in(df_train)
    print("model training over...")

    condensed = dist.pdist(df_test.values[:, :-4], 'euclidean')
    pairwise = dist.squareform(condensed)
    dist_mat = normalize(pairwise, axis=1, norm='max')

    # Result unused below; kept in case the helper mutates its argument.
    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    print("start obfuscating...")

    X_obf_dict = {}
    for run in range(50):
        obfuscated, _ = funcs.get_DP_obf_X(df_test, dist_mat, beta)
        X_obf_dict[run] = obfuscated
    _, X_ori = funcs.get_DP_obf_X(df_test, dist_mat, beta)

    print("obfuscating done.")

    # Propagate each user's area label to the original and obfuscated rows.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    return X_obf_dict, X_ori, rf_classifier, xgb_classifier
コード例 #4
0
def PrivCheck(df_train, df_test, df_test_rec_items, grid_area_dict,
              area_grid_dict, grid_list, cluster_num, grid_area_number, deltaX,
              pp):
    """Train the check-in models and obfuscate ``df_test`` with PrivCheck.

    Steps, in order:

    1. Refresh the grid groups of ``df_train`` and fit the random-forest and
       XGBoost models on it.
    2. Write ``funcs.cal_pgy_withoutGridGroup(df_test, ...)`` to
       ``tmp/pgy_check_in_privcheck.csv`` and one JSD matrix per grid group
       to ``tmp/JSDM_girdGroup_privcheck.mat``.
    3. Run the MATLAB ``PrivCheck`` routine with ``deltaX`` to obtain
       ``xpgg``.
    4. Reset ``grid_group`` of ``df_test`` to zero and draw 50 obfuscations
       plus the original records through ``funcs.get_obf_X``.

    Args:
        df_train: Training frame for the two models.
        df_test: Frame to obfuscate; its ``grid_group`` column is overwritten.
        df_test_rec_items: Frame whose grid groups are refreshed at the end;
            it is not read otherwise.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        area_grid_dict: Not used by this function; kept so existing callers
            keep working.
        grid_list: Forwarded to ``funcs.cal_pgy_withoutGridGroup``.
        cluster_num: Number of clusters; sizes the JSD tensor.
        grid_area_number: Number of grid groups iterated over.
        deltaX: Argument passed to the MATLAB ``PrivCheck`` function.
        pp: Forwarded unchanged to ``funcs.get_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori, model_rf, model_xgb)``; ``X_obf_dict``
        holds 50 obfuscation runs keyed 0..49.
    """
    df_train = funcs.update_grid_group(df_train, grid_area_dict)
    model_rf = funcs.train_rf_model_check_in(df_train)
    model_xgb = funcs.train_xgb_model_check_in(df_train)

    pgy = funcs.cal_pgy_withoutGridGroup(df_test, cluster_num, grid_list)
    pd.DataFrame(pgy).to_csv('tmp/pgy_check_in_privcheck.csv',
                             index=False,
                             header=None)

    df_test = funcs.update_grid_group(df_test, grid_area_dict)

    # One cluster_num x cluster_num JSD matrix per grid group.
    JSD_Mat_dict = np.zeros((cluster_num, cluster_num, grid_area_number))
    for gg in range(grid_area_number):
        df_test_gg = df_test.loc[df_test['grid_group'] == gg]
        # NOTE(review): the meaning of the literal 4 is defined by the helper.
        JSD_Mat_dict[:, :, gg] = funcs.cal_JSD_Matrix_withoutGridGroup(
            df_test_gg, cluster_num, 4)

    # File name and key are read by the MATLAB side; keep them verbatim.
    scipy.io.savemat('tmp/JSDM_girdGroup_privcheck.mat',
                     {"JSD_Mat_input_Yang_trueTrain": JSD_Mat_dict})

    eng = matlab.engine.start_matlab()
    try:
        eng.edit("../../matlab/checkin_tradeoff_scenario_I/PrivCheck",
                 nargout=0)
        eng.cd('../../matlab/checkin_tradeoff_scenario_I', nargout=0)
        # Unpack the two outputs directly: wrapping the pair in np.array
        # builds a ragged array when their shapes differ, which recent NumPy
        # releases refuse.  The second output is not needed here.
        xpgg, _ = eng.PrivCheck(deltaX, nargout=2)
        xpgg = np.array(xpgg)
    finally:
        # Always shut the MATLAB process down, even when the call fails.
        eng.quit()

    df_test['grid_group'] = pd.Series(np.zeros(df_test.shape[0]),
                                      index=df_test.index,
                                      dtype='int32')

    X_obf_dict = {}
    for i in range(50):
        X_obf_dict[i], _ = funcs.get_obf_X(df_test, xpgg, pp)

    _, X_ori = funcs.get_obf_X(df_test, xpgg, pp)

    # Write each user's area third-from-last in the original row and last in
    # that user's row of every obfuscation run.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    # Result unused; kept in case the helper mutates its argument in place.
    df_test_rec_items = funcs.update_grid_group(df_test_rec_items,
                                                grid_area_dict)

    return X_obf_dict, X_ori, model_rf, model_xgb
コード例 #5
0
def Similarity(df_train, grid_area_dict, pp):
    """Build 25 similarity-based obfuscations of ``df_train``.

    A cosine-similarity matrix over all but the last four columns of the
    grid-group-refreshed frame is passed to
    ``funcs.get_similarity_obf_X_withAgeGroup``.

    Args:
        df_train: Frame to obfuscate.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        pp: Forwarded unchanged to the obfuscation helper.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``; ``X_obf_dict`` maps run index 0..24 to
        the obfuscated records of that run.
    """
    print("start obfuscating...")
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    item_values = df_train[df_train.columns[:-4]].values
    similarity = cosine_similarity(item_values)

    X_obf_dict = {}
    for run in range(25):
        print("obfuscation {}".format(run))
        obfuscated, _ = funcs.get_similarity_obf_X_withAgeGroup(
            similarity, df_train, pp)
        X_obf_dict[run] = obfuscated

    # One extra call supplies the un-obfuscated records.
    _, X_ori = funcs.get_similarity_obf_X_withAgeGroup(similarity, df_train,
                                                       pp)

    print("obfuscating done.")

    # Propagate each user's area label to the original and obfuscated rows.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    return X_obf_dict, X_ori
コード例 #6
0
def differential_privacy(df_train, grid_area_dict, grid_area_number, beta):
    """Build 25 DP obfuscations of ``df_train`` using per-group distances.

    For every grid group ``0..grid_area_number-1`` a pairwise Euclidean
    distance matrix is computed over all but the last four columns of that
    group's rows and each matrix row is scaled by its maximum.

    Args:
        df_train: Frame to obfuscate; its grid groups are refreshed first.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        grid_area_number: Number of grid groups to build matrices for.
        beta: Forwarded unchanged to ``funcs.get_DP_obf_X_withAgeGroup``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``; ``X_obf_dict`` holds 25 runs keyed
        0..24.
    """
    print("start obfuscating...")

    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    dist_mat_dict = {}
    for group in range(grid_area_number):
        group_values = df_train[df_train['grid_group'] == group].values
        pairwise = dist.squareform(
            dist.pdist(group_values[:, :-4], 'euclidean'))
        dist_mat_dict[group] = normalize(pairwise, axis=1, norm='max')

    X_obf_dict = {}
    for run in range(25):
        obfuscated, _ = funcs.get_DP_obf_X_withAgeGroup(
            df_train, dist_mat_dict, beta)
        X_obf_dict[run] = obfuscated
    _, X_ori = funcs.get_DP_obf_X_withAgeGroup(df_train, dist_mat_dict, beta)
    print("obfuscating done.")

    # Propagate each user's area label to the original and obfuscated rows.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area
    return X_obf_dict, X_ori
コード例 #7
0
ファイル: obfuscations.py プロジェクト: scottshufe/HyObscure
def PrivCheck(df_train, cluster_num, grid_list, grid_area_dict,
              grid_area_number, area_grid_dict, deltaX, pp):
    """Obfuscate ``df_train`` with the PrivCheck baseline (25 runs).

    Steps, in order:

    1. Write ``funcs.cal_pgy_withoutGridGroup(df_train, ...)`` to
       ``tmp/pgy_check_in_privcheck.csv``.
    2. Refresh the grid groups and write one JSD matrix per grid group to
       ``tmp/JSDM_girdGroup_privcheck.mat``.
    3. Run the MATLAB ``PrivCheck`` routine with ``deltaX`` to obtain
       ``xpgg``.
    4. Draw 25 obfuscations plus the original records through
       ``funcs.get_obf_X_withAgeGroup``.

    Args:
        df_train: Frame to obfuscate.
        cluster_num: Number of clusters; sizes the JSD tensor.
        grid_list: Forwarded to ``funcs.cal_pgy_withoutGridGroup``.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        grid_area_number: Number of grid groups iterated over.
        area_grid_dict: Not used by this function; kept so existing callers
            keep working.
        deltaX: Argument passed to the MATLAB ``PrivCheck`` function.
        pp: Forwarded unchanged to ``funcs.get_obf_X_withAgeGroup``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``; ``X_obf_dict`` holds 25 runs keyed
        0..24.
    """
    pgy = funcs.cal_pgy_withoutGridGroup(df_train, cluster_num, grid_list)
    pd.DataFrame(pgy).to_csv('tmp/pgy_check_in_privcheck.csv',
                             index=False,
                             header=None)
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    # One cluster_num x cluster_num JSD matrix per grid group.
    JSD_Mat_dict = np.zeros((cluster_num, cluster_num, grid_area_number))
    for gg in range(grid_area_number):
        df_train_gg = df_train.loc[df_train['grid_group'] == gg]
        # NOTE(review): the meaning of the literal 4 is defined by the helper.
        JSD_Mat_dict[:, :, gg] = funcs.cal_JSD_Matrix_withoutGridGroup(
            df_train_gg, cluster_num, 4)

    # File name and key are read by the MATLAB side; keep them verbatim.
    scipy.io.savemat('tmp/JSDM_girdGroup_privcheck.mat',
                     {"JSD_Mat_input_Yang_allObf": JSD_Mat_dict})

    eng = matlab.engine.start_matlab()
    try:
        eng.edit("../../matlab/checkin_clusternum_scenario_II/PrivCheck",
                 nargout=0)
        eng.cd('../../matlab/checkin_clusternum_scenario_II', nargout=0)
        # Unpack the two outputs directly: wrapping the pair in np.array
        # builds a ragged array when their shapes differ, which recent NumPy
        # releases refuse.  The second output is not needed here.
        xpgg, _ = eng.PrivCheck(deltaX, nargout=2)
        xpgg = np.array(xpgg)
    finally:
        # Always shut the MATLAB process down, even when the call fails.
        eng.quit()

    df_train = funcs.update_grid_group(df_train, grid_area_dict)
    X_obf_dict = {}
    for i in range(25):
        X_obf_dict[i], _ = funcs.get_obf_X_withAgeGroup(df_train, xpgg, pp)

    _, X_ori = funcs.get_obf_X_withAgeGroup(df_train, xpgg, pp)

    # Write each user's area third-from-last in the original row and last in
    # that user's row of every obfuscation run.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    return X_obf_dict, X_ori
コード例 #8
0
def Random(df_train, grid_area_dict, p_rand):
    """Build 25 random obfuscations of ``df_train``.

    Args:
        df_train: Frame to obfuscate; its grid groups are refreshed first.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        p_rand: Forwarded unchanged to
            ``funcs.get_random_obf_X_withAgeGroup``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``; ``X_obf_dict`` holds 25 runs keyed
        0..24.
    """
    print("start obfuscating...")
    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {}
    for run in range(25):
        obfuscated, _ = funcs.get_random_obf_X_withAgeGroup(df_train, p_rand)
        X_obf_dict[run] = obfuscated

    # One extra call supplies the un-obfuscated records.
    _, X_ori = funcs.get_random_obf_X_withAgeGroup(df_train, p_rand)

    print("obfuscating done.")

    # Propagate each user's area label to the original and obfuscated rows.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    return X_obf_dict, X_ori
コード例 #9
0
def Frapp(df_train, grid_area_dict, gamma):
    """Build 25 FRAPP obfuscations of ``df_train``.

    Args:
        df_train: Frame to obfuscate; its grid groups are refreshed first.
        grid_area_dict: Lookup from a grid id to its area (grid group) id.
        gamma: Forwarded unchanged to
            ``funcs.get_frapp_obf_X_withAgeGroup``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``; ``X_obf_dict`` holds 25 runs keyed
        0..24.
    """
    print("start obfuscating...")

    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {}
    for run in range(25):
        obfuscated, _ = funcs.get_frapp_obf_X_withAgeGroup(df_train, gamma)
        X_obf_dict[run] = obfuscated
    # One extra call supplies the un-obfuscated records.
    _, X_ori = funcs.get_frapp_obf_X_withAgeGroup(df_train, gamma)
    print("obfuscating done.")

    # Propagate each user's area label to the original and obfuscated rows.
    for uid, original_row in X_ori.items():
        area = grid_area_dict[original_row[-2]]
        original_row[-3] = area
        for obfuscated in X_obf_dict.values():
            obfuscated[uid][-1] = area

    return X_obf_dict, X_ori
コード例 #10
0
ファイル: main.py プロジェクト: scottshufe/HyObscure
        pass
    else:
        os.makedirs('tmp')

    df = read_data()
    grid_list = list(set(df['grid'].values))
    grid_list.sort()
    grid_colrow, grid_rowcol = get_grid_loc(grid_list)

    print("initiate area...")
    area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate(
        grid_rowcol, grid_area_number)
        
    df['grid_group'] = pd.Series(np.zeros(df.shape[0]), index=df.index, dtype='int32')
    if method in ['HyObscure', 'YGen', 'XObf']:
        df_grid_group = funcs.update_grid_group(df, grid_area_dict)
    else:
        df_grid_group = df
    cols = list(df.columns.values)
    cols_change = cols[:-3]
    cols_change.extend(['grid_group', 'grid', 'uid'])
    df_item_gridGroup_uid = df_grid_group[cols_change]

    df_cluster = funcs.hierarchical_clustering(df_item_gridGroup_uid, cluster_num, -3, 'cosine', 'complete')
    drop_idx = df_cluster.loc[df_cluster['cluster']==5].index
    df_cluster.drop(drop_idx, inplace=True)
    drop_idx = []
    for i in df_cluster.grid.value_counts().index:
        if df_cluster.grid.value_counts()[i] < 3:
            drop_idx.extend(df_cluster.loc[df_cluster['grid']==i].index)
    df_cluster.drop(drop_idx, inplace=True)
コード例 #11
0
    df = read_data()
    grid_list = list(set(df['grid'].values))
    grid_list.sort()
    grid_colrow, grid_rowcol = get_grid_loc(grid_list)

    print("initiate area...")
    area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate(
        grid_rowcol, grid_area_number)

    df['grid_group'] = pd.Series(np.zeros(df.shape[0]),
                                 index=df.index,
                                 dtype='int32')

    if method in ['HyObscure', 'YGen', 'XObf']:
        df_grid_group = funcs.update_grid_group(df, grid_area_dict)
    else:
        df_grid_group = df
    cols = list(df.columns.values)
    cols_change = cols[:-3]
    cols_change.extend(['grid_group', 'grid', 'uid'])
    df_item_gridGroup_uid = df_grid_group[cols_change]

    for cluster_num in cluster_num_list:

        df_cluster = funcs.hierarchical_clustering(df_item_gridGroup_uid,
                                                   cluster_num, -3, 'cosine',
                                                   'complete')

        results = {}
        results['ori_acc_rf'] = []
コード例 #12
0
ファイル: obfuscations.py プロジェクト: scottshufe/HyObscure
def HyObscure(df_train, grid_area_dict, area_grid_dict, cluster_num,
              grid_area_number, grid_list, area_reducibility,
              area_grid_rowcol_dict, area_grid_colrow_dict, method,
              grid_rowcol, grid_colrow, l_threshold, k_threshold, deltaX, pp):
    """Jointly adjust the area partition and the obfuscation matrix.

    The function runs six rounds.  In each round it

    1. recomputes the JSD matrix and ``pgy`` via ``funcs.get_JSD_PGY``;
    2. solves one ``cluster_num x cluster_num`` block of ``xpgg`` per area
       with the MATLAB ``HyObscure`` routine;
    3. visits the areas in random order and, for each area whose l-diversity
       leaves room, tries to move grids from one side of the area to the
       neighbouring area.  A move is accepted only when the shrunken area
       still satisfies ``k_threshold`` and both ``funcs.Mean_KL_div`` and
       ``funcs.Mean_JSD`` drop below the best values seen in this round.

    Finally 25 obfuscations are drawn with ``funcs.get_obf_X`` using the last
    ``xpgg`` and the final area partition.

    Args:
        df_train: Frame to obfuscate; must contain a ``uid`` column.
        grid_area_dict: Lookup from a grid id to its area id.
        area_grid_dict: Lookup from an area id to the list of its grids.
        cluster_num: Number of clusters.
        grid_area_number: Number of areas.
        grid_list: List of grids; its length sizes the initial ``pgy``.
        area_reducibility: Per-area weights for the four directions; they
            are normalised into sampling probabilities.
        area_grid_rowcol_dict: Per-area line structure used for left/right
            moves.
        area_grid_colrow_dict: Per-area line structure used for up/down
            moves.
        method: Forwarded unchanged to ``funcs.get_JSD_PGY``.
        grid_rowcol: Selected together with the row/col dict; not read
            further inside this function.
        grid_colrow: Selected together with the col/row dict; not read
            further inside this function.
        l_threshold: l-diversity threshold.
        k_threshold: k-anonymity threshold.
        deltaX: Argument passed to the MATLAB ``HyObscure`` function.
        pp: Forwarded unchanged to ``funcs.get_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``; ``X_obf_dict`` holds 25 runs keyed
        0..24 and ``X_ori`` maps each uid to its original row, labelled with
        the area partition passed in by the caller.
    """
    # Build X_ori from a private copy so the caller's frame is not touched.
    df_train_copy = copy.deepcopy(df_train)
    df_train_copy['grid_group'] = pd.Series(np.zeros(df_train_copy.shape[0]),
                                            index=df_train_copy.index,
                                            dtype='int32')
    user_num = df_train_copy.shape[0]
    X_ori = {}
    for k in range(user_num):
        # NOTE(review): label-based lookup; assumes the index is 0..n-1.
        user_id = df_train_copy['uid'][k]
        X_ori[user_id] = df_train_copy[df_train_copy['uid'] == user_id].values[
            0, :-1]
    for k in X_ori.keys():
        user_grid = X_ori[k][-2]
        X_ori[k][-3] = grid_area_dict[user_grid]

    for i in area_grid_dict:
        print("user number in area ", i, " is ",
              funcs.k_anonymity(df_train, area_grid_dict[i]))

    print("start solving xpgg...")
    xpgg = np.ones((cluster_num * grid_area_number,
                    cluster_num * grid_area_number)) * 0.00000001
    JSD_Mat = np.ones(
        (cluster_num * grid_area_number, cluster_num * grid_area_number))
    pgy = np.ones(
        (len(grid_list), cluster_num * grid_area_number)) * 0.00000001

    JSD_Mat_dict = {}
    pgy_dict = {}

    for op in range(0, 6):
        ## compute JSD and pgy
        JSD_Mat, pgy, JSD_Mat_dict, pgy_dict = funcs.get_JSD_PGY(
            df_train, area_grid_dict, JSD_Mat_dict, cluster_num, pgy_dict,
            JSD_Mat, pgy, method)
        print('op:', op)
        grid_xpgg_dict = {}
        ## compute xpgg
        for gg in range(0, grid_area_number):
            # A fresh engine per area keeps the relative cd valid; it is
            # shut down right after the call so engines do not pile up.
            eng = matlab.engine.start_matlab()
            try:
                eng.edit(
                    '../../matlab/checkin_clusternum_scenario_II/HyObscure',
                    nargout=0)
                eng.cd('../../matlab/checkin_clusternum_scenario_II',
                       nargout=0)
                grid_xpgg_dict[gg] = np.array(eng.HyObscure(deltaX, gg))
            finally:
                eng.quit()

            # Scatter this area's block into the interleaved xpgg matrix.
            for row in range(cluster_num):
                for col in range(cluster_num):
                    xpgg[gg + row * grid_area_number,
                         gg + col * grid_area_number] = grid_xpgg_dict[gg][row,
                                                                           col]

        mean_Utility = funcs.Mean_JSD(JSD_Mat, xpgg)
        mean_Privacy = funcs.Mean_KL_div(pgy, xpgg)
        min_mean_Utility = mean_Utility
        min_mean_Privacy = mean_Privacy
        ## area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility
        areas = list(area_grid_dict.keys())
        random.shuffle(areas)
        ### change grid group (area) by stochastic privacy-utility boosting
        for area_code in areas:  ##select one area to adjust
            area_grids = area_grid_dict[
                area_code]  ## get all the grids in the area

            l_cur = funcs.l_diversity(df_train,
                                      area_grids)  ## check l diversity
            l_range = int(np.exp(l_cur) - np.exp(np.log(l_threshold)))
            print('start adjusting area: ', area_code)
            if l_range > 0:
                ### select one direction to adjust: left (0); right (1); up (2); down(3)
                d = np.random.choice([0, 1, 2, 3],
                                     p=area_reducibility[area_code] /
                                     np.sum(area_reducibility[area_code]))
                # the selected area can be reduced through the selected direction
                if d < 2:  ## change left or right
                    area_grid_line_list_dict = area_grid_rowcol_dict
                    line_list_to_grid = funcs.rowcol_to_grid
                    grid_linelist = grid_rowcol
                else:  ## change up or down
                    area_grid_line_list_dict = area_grid_colrow_dict
                    line_list_to_grid = funcs.colrow_to_grid
                    grid_linelist = grid_colrow
                area_lines = list(area_grid_line_list_dict[area_code].keys())
                area_lines.sort()
                for line in area_lines:
                    # recheck area l diversity
                    area_grids = area_grid_dict[
                        area_code]  ## get all the grids in the area
                    l_cur = funcs.l_diversity(df_train,
                                              area_grids)  ## check l diversity
                    l_range = int(np.exp(l_cur) - np.exp(np.log(l_threshold)))

                    # Never try to move more grids than the line contains.
                    change_range = l_range
                    line_lists = area_grid_line_list_dict[area_code][line]
                    line_lists.sort()
                    line_lists_len = len(line_lists)
                    if change_range > line_lists_len:
                        change_range = line_lists_len
                    for i in range(1, change_range + 1):
                        # Take i grids from the end or the start of the
                        # sorted line, depending on the direction.
                        if d == 0 or d == 3:
                            moveout_grid_lists = line_lists[-i:]
                        elif d == 1 or d == 2:
                            moveout_grid_lists = line_lists[:i]
                        moveout_grids = []
                        for mgc in moveout_grid_lists:
                            moveout_grids.append(line_list_to_grid(line, mgc))
                        adjusted_area_grids = list(
                            set(area_grids) - set(moveout_grids))

                        ## check k anonymity
                        k_adjust = funcs.k_anonymity(df_train,
                                                     adjusted_area_grids)

                        ## the adjusted schema meets both k-anonymity and l-diversity
                        if k_adjust >= k_threshold:
                            # Neighbouring area that receives the grids.
                            if d == 0:
                                to_area = area_code + 1
                            elif d == 1:
                                to_area = area_code - 1
                            elif d == 2:
                                to_area = area_code - int(
                                    grid_area_number /
                                    int(np.sqrt(grid_area_number)))
                            elif d == 3:
                                to_area = area_code + int(
                                    grid_area_number /
                                    int(np.sqrt(grid_area_number)))

                            ## adjust grid groups (areas): update area_grid_dict and grid_area_dict
                            area_grid_dict_cur = copy.deepcopy(area_grid_dict)
                            adjusted_area_grids.sort()
                            area_grid_dict_cur[area_code] = adjusted_area_grids
                            area_grid_dict_cur[to_area] = list(
                                set(area_grid_dict_cur[to_area])
                                | set(moveout_grids))
                            area_grid_dict_cur[to_area].sort()
                            grid_area_dict_cur = copy.deepcopy(grid_area_dict)
                            for grid in moveout_grids:
                                grid_area_dict_cur[grid] = to_area

                            for area_id in area_grid_dict_cur:
                                print("area:", area_id, "grid number:",
                                      len(area_grid_dict_cur[area_id]))

                            print('from area: ', area_code, 'to area: ',
                                  to_area, 'change line: ', line,
                                  'moveout_grids: ', moveout_grids)

                            df_train_new = funcs.update_grid_group(
                                df_train, grid_area_dict_cur)
                            new_JSD_Mat, new_pgy, new_JSD_Mat_dict, new_pgy_dict = funcs.get_JSD_PGY(
                                df_train_new, area_grid_dict_cur, JSD_Mat_dict,
                                cluster_num, pgy_dict, JSD_Mat, pgy, method)

                            new_mean_Utility = funcs.Mean_JSD(
                                new_JSD_Mat, xpgg)
                            new_mean_Privacy = funcs.Mean_KL_div(new_pgy, xpgg)

                            # Accept only when both metrics strictly improve.
                            if new_mean_Privacy < min_mean_Privacy and new_mean_Utility < min_mean_Utility:
                                min_mean_Utility = new_mean_Utility
                                min_mean_Privacy = new_mean_Privacy
                                min_grid_area_dict = grid_area_dict_cur
                                min_area_grid_dict = area_grid_dict_cur
                                min_df_train = df_train_new

                                grid_area_dict = min_grid_area_dict
                                area_grid_dict = min_area_grid_dict
                                df_train = min_df_train
                                min_distortion_budget = min_mean_Utility
                                area_grid_rowcol_dict, area_grid_colrow_dict = funcs.update_rowcol_colrow_dict(
                                    area_grid_dict)
                                print("! Find a better area group")
                                # Leaves only the i-loop; the next line of
                                # this area is still examined.
                                break

                            print(op, area_code, to_area, line, mgc,
                                  mean_Privacy, mean_Utility, min_mean_Privacy,
                                  min_mean_Utility, new_mean_Privacy,
                                  new_mean_Utility)

                        else:
                            print("*** area not meet k_anonymity requirement")
            else:
                print("*** area not meet l_diversity requirement")

    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {}
    for i in range(25):
        X_obf_dict[i], _ = funcs.get_obf_X(df_train, xpgg, pp)

    return X_obf_dict, X_ori
コード例 #13
0
            cluster_col.append(6)
        elif i == 8:
            cluster_col.append(7)
        else:
            cluster_col.append(i)
    df_cluster.cluster = cluster_col
    cluster_num = len(set(df_cluster.cluster))

    for grid_area_number in grid_area_number_list:

        print("initiate area...")
        area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate(
            grid_rowcol, grid_area_number)

        if method in ['HyObscure', 'YGen', 'XObf']:
            df_cluster = funcs.update_grid_group(df_cluster, grid_area_dict)

        results = {}
        results['ori_acc_rf'] = []
        results['obf_acc_rf'] = []
        results['ori_acc_xgb'] = []
        results['obf_acc_xgb'] = []
        results['rec_ori'] = []
        results['rec_obf'] = []

        for r in range(20):
            if method in ['HyObscure', 'YGen']:
                area_grid_rowcol_dict, area_grid_colrow_dict, area_grid_dict, grid_area_dict, area_reducibility = area_initiate(
                    grid_rowcol, grid_area_number)
                df_cluster = funcs.update_grid_group(df_cluster,
                                                     grid_area_dict)