コード例 #1
0
ファイル: obfuscations.py プロジェクト: scottshufe/HyObscure
def HyObscure(df_train, grid_area_dict, area_grid_dict, cluster_num,
              grid_area_number, grid_list, area_reducibility,
              area_grid_rowcol_dict, area_grid_colrow_dict, method,
              grid_rowcol, grid_colrow, l_threshold, k_threshold, deltaX, pp):
    """Regroup grids into areas over 6 rounds and obfuscate the training data.

    Every round recomputes the JSD / pgy matrices with ``funcs.get_JSD_PGY``,
    asks the MATLAB ``HyObscure`` script for one obfuscation matrix per area,
    and then visits the areas in random order trying to move grids from an
    area into a neighbouring one.  A move is adopted only when the shrunken
    area still satisfies ``k_threshold`` and both ``funcs.Mean_KL_div`` and
    ``funcs.Mean_JSD`` drop below the best values seen in that round.

    Args:
        df_train: DataFrame with a ``'uid'`` column; forwarded to the
            ``funcs`` helpers.
        grid_area_dict: Mapping grid -> area code.
        area_grid_dict: Mapping area code -> list of grids.
        cluster_num: Number of clusters; with ``grid_area_number`` it fixes
            the side length of ``xpgg`` and ``JSD_Mat``.
        grid_area_number: Number of areas (one MATLAB solve per area).
        grid_list: Only its length is used (row count of ``pgy``).
        area_reducibility: Per area, four non-negative weights that are
            normalised into probabilities for the directions 0-3.
        area_grid_rowcol_dict: area -> {line -> list of positions}, used for
            directions 0 and 1.
        area_grid_colrow_dict: area -> {line -> list of positions}, used for
            directions 2 and 3.
        method: Forwarded to ``funcs.get_JSD_PGY``.
        grid_rowcol: Not used by this function; kept for the interface.
        grid_colrow: Not used by this function; kept for the interface.
        l_threshold: l-diversity threshold compared against
            ``exp(funcs.l_diversity(...))``.
        k_threshold: Minimum value of ``funcs.k_anonymity`` an adjusted area
            must keep.
        deltaX: Forwarded to the MATLAB ``HyObscure`` function.
        pp: Forwarded to ``funcs.get_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``: ``X_obf_dict`` maps 0..24 to the first
        return value of ``funcs.get_obf_X``; ``X_ori`` maps each uid to its
        row of the original data.
    """
    df_train_copy = copy.deepcopy(df_train)
    df_train_copy['grid_group'] = pd.Series(np.zeros(df_train_copy.shape[0]),
                                            index=df_train_copy.index,
                                            dtype='int32')

    # Snapshot the first row of every user (minus the helper column added
    # above).  One positional pass: no dependence on the index labels and no
    # per-user scan of the whole frame.
    X_ori = {}
    for user_id, user_row in zip(df_train_copy['uid'].values,
                                 df_train_copy.values):
        if user_id not in X_ori:
            X_ori[user_id] = user_row[:-1].copy()
    for user_row in X_ori.values():
        # presumably [-2] is the grid column and [-3] the area column --
        # TODO confirm against the caller's frame layout
        user_grid = user_row[-2]
        user_row[-3] = grid_area_dict[user_grid]

    for area in area_grid_dict:
        print("user number in area ", area, " is ",
              funcs.k_anonymity(df_train, area_grid_dict[area]))

    print("start solving xpgg...")
    matrix_size = cluster_num * grid_area_number
    xpgg = np.ones((matrix_size, matrix_size)) * 0.00000001
    JSD_Mat = np.ones((matrix_size, matrix_size))
    pgy = np.ones((len(grid_list), matrix_size)) * 0.00000001

    JSD_Mat_dict = {}
    pgy_dict = {}

    for op in range(0, 6):
        # compute JSD and pgy
        JSD_Mat, pgy, JSD_Mat_dict, pgy_dict = funcs.get_JSD_PGY(
            df_train, area_grid_dict, JSD_Mat_dict, cluster_num, pgy_dict,
            JSD_Mat, pgy, method)
        print('op:', op)
        grid_xpgg_dict = {}
        # compute xpgg: one MATLAB solve per area
        for gg in range(0, grid_area_number):
            eng = matlab.engine.start_matlab()
            try:
                eng.edit(
                    '../../matlab/checkin_clusternum_scenario_II/HyObscure',
                    nargout=0)
                eng.cd('../../matlab/checkin_clusternum_scenario_II',
                       nargout=0)
                grid_xpgg_dict[gg] = np.array(eng.HyObscure(deltaX, gg))
            finally:
                # Always release the engine; otherwise one MATLAB process
                # per area per round stays alive.
                eng.quit()

            for row in range(cluster_num):
                for col in range(cluster_num):
                    xpgg[gg + row * grid_area_number,
                         gg + col * grid_area_number] = \
                        grid_xpgg_dict[gg][row, col]

        mean_Utility = funcs.Mean_JSD(JSD_Mat, xpgg)
        mean_Privacy = funcs.Mean_KL_div(pgy, xpgg)
        min_mean_Utility = mean_Utility
        min_mean_Privacy = mean_Privacy

        areas = list(area_grid_dict.keys())
        random.shuffle(areas)
        # change grid group (area) by stochastic privacy-utility boosting
        for area_code in areas:  # select one area to adjust
            area_grids = area_grid_dict[area_code]
            l_cur = funcs.l_diversity(df_train, area_grids)
            l_range = int(np.exp(l_cur) - np.exp(np.log(l_threshold)))
            print('start adjusting area: ', area_code)
            if l_range <= 0:
                print("*** area not meet l_diversity requirement")
                continue

            # direction to shrink: left (0); right (1); up (2); down (3)
            d = np.random.choice([0, 1, 2, 3],
                                 p=area_reducibility[area_code] /
                                 np.sum(area_reducibility[area_code]))
            if d < 2:  # change left or right
                area_grid_line_list_dict = area_grid_rowcol_dict
                line_list_to_grid = funcs.rowcol_to_grid
            else:  # change up or down
                area_grid_line_list_dict = area_grid_colrow_dict
                line_list_to_grid = funcs.colrow_to_grid

            # NOTE(review): when a move is accepted below,
            # area_grid_rowcol_dict / area_grid_colrow_dict are rebuilt, but
            # area_grid_line_list_dict and area_lines keep pointing at the
            # pre-move layout for the remaining lines of this area.
            # Behaviour kept as in the original -- confirm it is intended.
            area_lines = sorted(area_grid_line_list_dict[area_code].keys())
            for line in area_lines:
                # recheck l diversity: the area may have changed meanwhile
                area_grids = area_grid_dict[area_code]
                l_cur = funcs.l_diversity(df_train, area_grids)
                l_range = int(np.exp(l_cur) - np.exp(np.log(l_threshold)))

                line_lists = area_grid_line_list_dict[area_code][line]
                line_lists.sort()
                change_range = min(l_range, len(line_lists))
                for i in range(1, change_range + 1):
                    if d == 0 or d == 3:
                        moveout_grid_lists = line_lists[-i:]
                    else:  # d == 1 or d == 2
                        moveout_grid_lists = line_lists[:i]
                    moveout_grids = [
                        line_list_to_grid(line, position)
                        for position in moveout_grid_lists
                    ]
                    adjusted_area_grids = list(
                        set(area_grids) - set(moveout_grids))

                    # check k anonymity of what would remain in the area
                    k_adjust = funcs.k_anonymity(df_train,
                                                 adjusted_area_grids)
                    if k_adjust < k_threshold:
                        print("*** area not meet k_anonymity requirement")
                        continue

                    # the adjusted schema meets both k-anonymity and
                    # l-diversity: pick the receiving neighbour area
                    if d == 0:
                        to_area = area_code + 1
                    elif d == 1:
                        to_area = area_code - 1
                    elif d == 2:
                        to_area = area_code - int(
                            grid_area_number /
                            int(np.sqrt(grid_area_number)))
                    else:  # d == 3
                        to_area = area_code + int(
                            grid_area_number /
                            int(np.sqrt(grid_area_number)))

                    # candidate grouping: updated copies of both mappings
                    area_grid_dict_cur = copy.deepcopy(area_grid_dict)
                    adjusted_area_grids.sort()
                    area_grid_dict_cur[area_code] = adjusted_area_grids
                    area_grid_dict_cur[to_area] = sorted(
                        set(area_grid_dict_cur[to_area])
                        | set(moveout_grids))
                    grid_area_dict_cur = copy.deepcopy(grid_area_dict)
                    for grid in moveout_grids:
                        grid_area_dict_cur[grid] = to_area

                    for area in area_grid_dict_cur:
                        print("area:", area, "grid number:",
                              len(area_grid_dict_cur[area]))

                    print('from area: ', area_code, 'to area: ',
                          to_area, 'change line: ', line,
                          'moveout_grids: ', moveout_grids)

                    df_train_new = funcs.update_grid_group(
                        df_train, grid_area_dict_cur)
                    (new_JSD_Mat, new_pgy, new_JSD_Mat_dict,
                     new_pgy_dict) = funcs.get_JSD_PGY(
                         df_train_new, area_grid_dict_cur, JSD_Mat_dict,
                         cluster_num, pgy_dict, JSD_Mat, pgy, method)

                    new_mean_Utility = funcs.Mean_JSD(new_JSD_Mat, xpgg)
                    new_mean_Privacy = funcs.Mean_KL_div(new_pgy, xpgg)

                    if (new_mean_Privacy < min_mean_Privacy
                            and new_mean_Utility < min_mean_Utility):
                        # adopt the candidate grouping as the new baseline
                        min_mean_Utility = new_mean_Utility
                        min_mean_Privacy = new_mean_Privacy
                        grid_area_dict = grid_area_dict_cur
                        area_grid_dict = area_grid_dict_cur
                        df_train = df_train_new
                        area_grid_rowcol_dict, area_grid_colrow_dict = \
                            funcs.update_rowcol_colrow_dict(area_grid_dict)
                        print("! Find a better area group")
                        break

                    print(op, area_code, to_area, line,
                          moveout_grid_lists[-1], mean_Privacy,
                          mean_Utility, min_mean_Privacy, min_mean_Utility,
                          new_mean_Privacy, new_mean_Utility)

    df_train = funcs.update_grid_group(df_train, grid_area_dict)

    X_obf_dict = {}
    for i in range(25):
        X_obf_dict[i], _ = funcs.get_obf_X(df_train, xpgg, pp)

    return X_obf_dict, X_ori
コード例 #2
0
def HyObscure(df_train, df_test, df_test_rec_items, df_item_age_uid,
              age_group_dict, group_age_dict, cluster_num, age_group_number,
              age_list, deltaX, k_threshold, l_threshold, pp):
    """Greedily regroup ages, train two models and obfuscate the test data.

    Up to 5 rounds are run.  Each round computes per-age-group JSD and pgy
    matrices, writes them to CSV files under ``tmp/``, asks the MATLAB
    ``HyObscure`` script for one obfuscation matrix per group, and then
    evaluates every candidate grouping returned by
    ``funcs.age_group_adjust_greedy``.  The candidate with the lowest
    ``funcs.Mean_JSD`` / ``funcs.Mean_KL_div`` that beats the current
    grouping on both is adopted; if none does, the rounds stop early.

    Args:
        df_train: DataFrame used to train the two models after its age
            groups are updated.
        df_test: DataFrame with ``'uid'`` and ``'age_group'`` columns; the
            data that is analysed and obfuscated.
        df_test_rec_items: Passed through ``funcs.update_age_group``; the
            result is not used further in this function.
        df_item_age_uid: Forwarded to ``funcs.age_group_adjust_greedy``.
        age_group_dict: Mapping age -> group.
        group_age_dict: Mapping group -> list of ages; the first element of
            each list is used as that group's minimum age.
        cluster_num: Number of clusters.
        age_group_number: Number of age groups.
        age_list: Only its length is used here (row count of ``pgy``), apart
            from being forwarded to ``funcs.cal_pgy_withAgeGroup``.
        deltaX: Forwarded to the MATLAB ``HyObscure`` function.
        k_threshold: Forwarded to ``funcs.age_group_adjust_greedy``.
        l_threshold: Its natural log is forwarded to
            ``funcs.age_group_adjust_greedy``.
        pp: Forwarded to ``funcs.get_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori, model_rf, model_xgb)``: ``X_obf_dict``
        maps 0..99 to the first return value of ``funcs.get_obf_X``;
        ``X_ori`` maps each uid to its original row; the models come from
        ``funcs.train_rf_model`` and ``funcs.train_xgb_model``.
    """
    df_test_copy = copy.deepcopy(df_test)
    df_test_copy['age_group'] = pd.Series(np.zeros(df_test_copy.shape[0]),
                                          index=df_test_copy.index,
                                          dtype='int32')
    matrix_size = cluster_num * age_group_number
    xpgg = np.ones((matrix_size, matrix_size)) * 0.00000001
    JSD_Mat = np.ones((matrix_size, matrix_size))
    pgy = np.ones((len(age_list), matrix_size)) * 0.00000001
    group_min_age_dict = {}
    group_usersize_dict = {}

    for op in range(0, 5):
        age_xpgg_dict = {}
        # Compute JSD, pgy, xpgg per age group
        JSD_Mat_dict = {}
        pgy_dict = {}

        for ag in range(age_group_number):
            group_min_age_dict[ag] = group_age_dict[ag][0]
            print(group_min_age_dict[ag])
            df_test_ag = df_test.loc[df_test['age_group'] == ag]
            age_list_ag = group_age_dict[ag]
            group_usersize_dict[ag] = df_test_ag.shape[0]

            JSD_Mat_dict[ag] = funcs.cal_JSD_Matrix_withoutAgeGroup(
                df_test_ag, cluster_num, 4)
            print(ag, cluster_num, age_list_ag)
            pgy_dict[ag] = funcs.cal_pgy_withoutAgeGroup(
                df_test_ag, cluster_num, age_list_ag)

            # presumably these files are the inputs the MATLAB script reads
            # -- TODO confirm
            pd.DataFrame(JSD_Mat_dict[ag]).to_csv(
                'tmp/JSDM_ageGroup_hyobscure.csv', index=False, header=None)
            pd.DataFrame(pgy_dict[ag]).to_csv('tmp/pgy_ageGroup_hyobscure.csv',
                                              index=False,
                                              header=None)

            eng = matlab.engine.start_matlab()
            try:
                eng.edit('../../matlab/age_clusternum_scenario_I/HyObscure',
                         nargout=0)
                eng.cd('../../matlab/age_clusternum_scenario_I', nargout=0)
                # Unpack the two outputs directly: wrapping the
                # (matrix, scalar) pair in np.array builds a ragged array,
                # which newer NumPy rejects.  The second output is unused.
                solver_xpgg, _ = eng.HyObscure(deltaX, nargout=2)
                age_xpgg_dict[ag] = np.array(solver_xpgg)
            finally:
                # Always release the engine; otherwise one MATLAB process
                # per group per round stays alive.
                eng.quit()

        total_users = df_test.shape[0]
        for ag in range(age_group_number):
            for age in group_age_dict[ag]:
                for col in range(cluster_num):
                    # Row offset is relative to group 0's minimum age; the
                    # per-group matrix is indexed relative to its own.
                    pgy[age - group_min_age_dict[0],
                        ag + col * age_group_number] = (
                            pgy_dict[ag][age - group_min_age_dict[ag], col]
                            * group_usersize_dict[ag] / total_users)

        for ag in range(age_group_number):
            for row in range(cluster_num):
                for col in range(cluster_num):
                    target = (ag + row * age_group_number,
                              ag + col * age_group_number)
                    xpgg[target] = age_xpgg_dict[ag][row, col]
                    JSD_Mat[target] = JSD_Mat_dict[ag][row, col]

        # change age group by greedy approach
        mean_Utility = funcs.Mean_JSD(JSD_Mat, xpgg)
        mean_Privacy = funcs.Mean_KL_div(pgy, xpgg)
        min_mean_Utility = mean_Utility
        min_mean_Privacy = mean_Privacy

        adjustable_groups, _ = funcs.age_group_adjust_greedy(
            df_item_age_uid, group_age_dict, k_threshold, np.log(l_threshold))
        min_group = 0
        for i in adjustable_groups:
            candidate = adjustable_groups[i]
            # invert group -> ages into age -> group
            age_group_dict_cur = {}
            for group, group_age_list in candidate.items():
                for age in group_age_list:
                    age_group_dict_cur[age] = group

            df_test_new = funcs.update_age_group(df_test, age_group_dict_cur)
            new_JSD_Mat = funcs.cal_JSD_Matrix_withAgeGroup(
                df_test_new, cluster_num, age_group_number, 4)
            new_pgy = funcs.cal_pgy_withAgeGroup(df_test_new, cluster_num,
                                                 age_group_number, age_list)
            new_mean_Utility = funcs.Mean_JSD(new_JSD_Mat, xpgg)
            new_mean_Privacy = funcs.Mean_KL_div(new_pgy, xpgg)
            if (new_mean_Utility < min_mean_Utility
                    and new_mean_Privacy < min_mean_Privacy):
                min_mean_Utility = new_mean_Utility
                min_mean_Privacy = new_mean_Privacy
                min_group_age_dict = copy.deepcopy(candidate)
                min_age_group_dict = copy.deepcopy(age_group_dict_cur)
                min_group = i
            print(op, i, min_group, mean_Privacy, mean_Utility,
                  min_mean_Privacy, min_mean_Utility, new_mean_Privacy,
                  new_mean_Utility)

        # The strict comparisons can only hold if some candidate was
        # accepted above, so the min_* dicts are bound on this branch.
        if min_mean_Privacy < mean_Privacy and min_mean_Utility < mean_Utility:
            print("find a better age group:", group_age_dict)
            age_group_dict = min_age_group_dict
            group_age_dict = min_group_age_dict
            df_test = funcs.update_age_group(df_test, age_group_dict)
        else:
            break

    # Snapshot the first row of every user from the untouched copy (minus
    # the helper column added at the top).  One positional pass: no
    # dependence on the index labels and no per-user scan of the frame.
    X_ori = {}
    for user_id, user_row in zip(df_test_copy['uid'].values,
                                 df_test_copy.values):
        if user_id not in X_ori:
            X_ori[user_id] = user_row[:-1].copy()
    for user_row in X_ori.values():
        # presumably [-2] is the age column and [-3] the age-group column --
        # TODO confirm against the caller's frame layout
        user_age = user_row[-2]
        user_row[-3] = age_group_dict[user_age]

    df_test = funcs.update_age_group(df_test, age_group_dict)
    df_train = funcs.update_age_group(df_train, age_group_dict)
    df_test_rec_items = funcs.update_age_group(df_test_rec_items,
                                               age_group_dict)

    model_rf = funcs.train_rf_model(df_train)
    model_xgb = funcs.train_xgb_model(df_train)
    print("model train over, start obfuscating...")

    X_obf_dict = {}
    for i in range(100):
        X_obf_dict[i], _ = funcs.get_obf_X(df_test, xpgg, pp)

    return X_obf_dict, X_ori, model_rf, model_xgb
コード例 #3
0
                                            row, col]

                        pd.DataFrame(xpgg).to_csv('xpgg.csv',
                                                  index=False,
                                                  header=None)
                        pd.DataFrame(pgy).to_csv('pgy_full.csv',
                                                 index=False,
                                                 header=None)
                        pd.DataFrame(JSD_Mat).to_csv('JSD_full.csv',
                                                     index=False,
                                                     header=None)

                        min_JSD_Mat = JSD_Mat
                        min_pgy = pgy
                        ### change age group by greedy approach
                        mean_Utility = funcs.Mean_JSD(JSD_Mat, xpgg)
                        mean_Privacy = funcs.Mean_KL_div(pgy, xpgg)
                        min_mean_Utility = mean_Utility
                        min_mean_Privacy = mean_Privacy

                        adjustable_groups, reducible_groups = age_group_adjust_greedy(
                            df_item_age_uid, group_age_dict, k_threshold,
                            np.log(l_threshold), beta, alpha)
                        min_group = 0
                        for i in adjustable_groups:
                            age_group_dict_cur = {}
                            for group, group_age_list in adjustable_groups[
                                    i].items():
                                for age in group_age_list:
                                    age_group_dict_cur[age] = group
コード例 #4
0
def YGen(df_train, age_group_number, cluster_num, age_list, age_group_dict,
         group_age_dict, df_item_age_uid, deltaX, k_threshold, l_threshold,
         pp):
    """Solve the obfuscation matrix once, try one regrouping, then obfuscate.

    Per age group, JSD and pgy matrices are computed and written to CSV
    files under ``tmp/``, and the MATLAB ``YGen`` script is asked for that
    group's obfuscation matrix.  Every candidate grouping returned by
    ``funcs.age_group_adjust_greedy`` is then scored against the resulting
    ``xpgg``; the best one that lowers both ``funcs.Mean_JSD`` and
    ``funcs.Mean_KL_div`` replaces ``age_group_dict`` before the training
    data is obfuscated.

    Args:
        df_train: DataFrame with ``'uid'`` and ``'age_group'`` columns.
        age_group_number: Number of age groups.
        cluster_num: Number of clusters.
        age_list: Only its length is used here (row count of ``pgy``), apart
            from being forwarded to ``funcs.cal_pgy_withAgeGroup``.
        age_group_dict: Mapping age -> group.
        group_age_dict: Mapping group -> list of ages; the first element of
            each list is used as that group's minimum age.
        df_item_age_uid: Forwarded to ``funcs.age_group_adjust_greedy``.
        deltaX: Forwarded to the MATLAB ``YGen`` function.
        k_threshold: Forwarded to ``funcs.age_group_adjust_greedy``.
        l_threshold: Its natural log is forwarded to
            ``funcs.age_group_adjust_greedy``.
        pp: Forwarded to ``funcs.get_obf_X``.

    Returns:
        Tuple ``(X_obf_dict, X_ori)``: ``X_obf_dict`` maps 0..24 to the first
        return value of ``funcs.get_obf_X``; ``X_ori`` maps each uid to its
        original row, labelled with the *incoming* ``age_group_dict``.
    """
    df_train_copy = copy.deepcopy(df_train)
    df_train_copy['age_group'] = pd.Series(np.zeros(df_train_copy.shape[0]),
                                           index=df_train_copy.index,
                                           dtype='int32')

    # Snapshot the first row of every user (minus the helper column added
    # above).  One positional pass: no dependence on the index labels and no
    # per-user scan of the whole frame.
    X_ori = {}
    for user_id, user_row in zip(df_train_copy['uid'].values,
                                 df_train_copy.values):
        if user_id not in X_ori:
            X_ori[user_id] = user_row[:-1].copy()
    for user_row in X_ori.values():
        # presumably [-2] is the age column and [-3] the age-group column --
        # TODO confirm against the caller's frame layout
        user_age = user_row[-2]
        user_row[-3] = age_group_dict[user_age]

    matrix_size = cluster_num * age_group_number
    xpgg = np.ones((matrix_size, matrix_size)) * 0.00000001

    # Pass 1: per-group statistics -> CSV -> MATLAB solve -> xpgg.
    age_xpgg_dict = {}
    for ag in range(age_group_number):
        print(group_age_dict[ag][0])
        df_train_ag = df_train.loc[df_train['age_group'] == ag]
        age_list_ag = group_age_dict[ag]

        group_jsd = funcs.cal_JSD_Matrix_withoutAgeGroup(
            df_train_ag, cluster_num, 4)
        group_pgy = funcs.cal_pgy_withoutAgeGroup(df_train_ag, cluster_num,
                                                  age_list_ag)
        # presumably these files are the inputs the MATLAB script reads --
        # TODO confirm
        pd.DataFrame(group_jsd).to_csv('tmp/JSDM_ageGroup_ygen.csv',
                                       index=False,
                                       header=None)
        pd.DataFrame(group_pgy).to_csv('tmp/pgy_ageGroup_ygen.csv',
                                       index=False,
                                       header=None)

        eng = matlab.engine.start_matlab()
        try:
            eng.edit('../../matlab/age_tradeoff_scenario_II/YGen', nargout=0)
            eng.cd('../../matlab/age_tradeoff_scenario_II', nargout=0)
            # Unpack the two outputs directly: wrapping the (matrix, scalar)
            # pair in np.array builds a ragged array, which newer NumPy
            # rejects.  The second output is unused.
            solver_xpgg, _ = eng.YGen(deltaX, nargout=2)
            age_xpgg_dict[ag] = np.array(solver_xpgg)
        finally:
            # Always release the engine; otherwise one MATLAB process per
            # age group stays alive.
            eng.quit()

    for ag in range(age_group_number):
        for row in range(cluster_num):
            for col in range(cluster_num):
                xpgg[ag + row * age_group_number,
                     ag + col * age_group_number] = age_xpgg_dict[ag][row, col]

    # Pass 2: build the full JSD_Mat / pgy used for scoring.
    # NOTE(review): this repeats the funcs.cal_* calls of pass 1 on unchanged
    # inputs; if those helpers are deterministic the pass-1 results could be
    # reused.  Kept as a recomputation because that is not visible here.
    JSD_Mat = np.ones((matrix_size, matrix_size))
    pgy = np.ones((len(age_list), matrix_size)) * 0.00000001
    group_min_age_dict = {}
    group_usersize_dict = {}
    JSD_Mat_dict = {}
    pgy_dict = {}

    for ag in range(age_group_number):
        group_min_age_dict[ag] = group_age_dict[ag][0]
        print(group_min_age_dict[ag])
        df_train_ag = df_train.loc[df_train['age_group'] == ag]
        age_list_ag = group_age_dict[ag]
        group_usersize_dict[ag] = df_train_ag.shape[0]

        JSD_Mat_dict[ag] = funcs.cal_JSD_Matrix_withoutAgeGroup(
            df_train_ag, cluster_num, 4)
        pgy_dict[ag] = funcs.cal_pgy_withoutAgeGroup(df_train_ag, cluster_num,
                                                     age_list_ag)

    total_users = df_train.shape[0]
    for ag in range(age_group_number):
        for age in group_age_dict[ag]:
            for col in range(cluster_num):
                # Row offset is relative to group 0's minimum age; the
                # per-group matrix is indexed relative to its own.
                pgy[age - group_min_age_dict[0],
                    ag + col * age_group_number] = (
                        pgy_dict[ag][age - group_min_age_dict[ag], col]
                        * group_usersize_dict[ag] / total_users)

    for ag in range(age_group_number):
        for row in range(cluster_num):
            for col in range(cluster_num):
                JSD_Mat[ag + row * age_group_number,
                        ag + col * age_group_number] = \
                    JSD_Mat_dict[ag][row, col]

    # change age group by greedy approach
    mean_Utility = funcs.Mean_JSD(JSD_Mat, xpgg)
    mean_Privacy = funcs.Mean_KL_div(pgy, xpgg)
    min_mean_Utility = mean_Utility
    min_mean_Privacy = mean_Privacy

    adjustable_groups, _ = funcs.age_group_adjust_greedy(
        df_item_age_uid, group_age_dict, k_threshold, np.log(l_threshold))
    min_group = 0
    print("start adjusting...")
    found_better = False
    for i in adjustable_groups:
        # invert group -> ages into age -> group
        age_group_dict_cur = {}
        for group, group_age_list in adjustable_groups[i].items():
            for age in group_age_list:
                age_group_dict_cur[age] = group

        df_train_new = funcs.update_age_group(df_train, age_group_dict_cur)
        new_JSD_Mat = funcs.cal_JSD_Matrix_withAgeGroup(
            df_train_new, cluster_num, age_group_number, 4)
        new_pgy = funcs.cal_pgy_withAgeGroup(df_train_new, cluster_num,
                                             age_group_number, age_list)
        new_mean_Utility = funcs.Mean_JSD(new_JSD_Mat, xpgg)
        new_mean_Privacy = funcs.Mean_KL_div(new_pgy, xpgg)
        print(new_mean_Privacy)
        print(new_mean_Utility)
        if (new_mean_Utility < min_mean_Utility
                and new_mean_Privacy < min_mean_Privacy):
            min_mean_Utility = new_mean_Utility
            min_mean_Privacy = new_mean_Privacy
            min_age_group_dict = copy.deepcopy(age_group_dict_cur)
            min_group = i
            print('Find better group!')
            found_better = True
            print(i, min_group, mean_Privacy, mean_Utility, min_mean_Privacy,
                  min_mean_Utility, new_mean_Privacy, new_mean_Utility)

    if found_better:
        age_group_dict = min_age_group_dict
    else:
        print("find better group failed.")

    df_train = funcs.update_age_group(df_train, age_group_dict)

    # Use the solved xpgg to produce the obfuscated versions of df_train.
    X_obf_dict = {}
    for i in range(25):
        X_obf_dict[i], _ = funcs.get_obf_X(df_train, xpgg, pp)

    return X_obf_dict, X_ori