Esempio n. 1
0
def main():
    """Driver: build a random k-way marginal workload over the 'adult'
    dataset, pick small positive/negated query workloads, draw exponential
    (FTPL-style) noise, and solve + score one weighted oracle instance.
    """
    dataset = 'adult'
    workload = 8   # number of random marginal workloads
    marginal = 3   # arity of each marginal (k = 3)

    data, workloads = benchmarks.randomKway(dataset, workload, marginal)

    ######################################################
    ## Get Queries
    ######################################################
    query_manager = QueryManager(data.domain, workloads)
    print("Number of queries = ", len(query_manager.queries))

    # Positive query workload and a negated one (q_neg(D) = 1 - q(D)).
    W_p = query_manager.get_query_workload([1, 4, 7, 100])
    W_n = query_manager.get_query_workload([2, 5, 10,
                                            11])  # q_neg(D) = 1 - q(D)
    D = W_p.shape[1]  # one-hot record length
    noise = np.random.exponential(1, D)
    # noise = np.zeros( D)

    print(f'noise.shape = {noise.shape}')

    # Weight every query equally (4 positive, 4 negated); alpha = 0.
    x = oracle_weighted.solve(W_p, np.ones(4), W_n, np.ones(4), noise,
                              data.domain, 0)
    print(f'best  score = {get_score(x, W_p, W_n, noise, marginal)}')
Esempio n. 2
0
File: fem.py Progetto: giusevtr/fem
def generate(real_answers: np.array,
             N: int,
             domain: Domain,
             query_manager: QueryManager,
             epsilon: float,
             delta: float,
             epsilon_split: float,
             noise_multiple: float,
             samples: int,
             alpha=0,
             show_prgress=True):
    """Run the FEM generator (FTPL sampling + Exponential Mechanism) and
    return a synthetic ``Dataset`` accumulated over all rounds.

    Parameters
    ----------
    real_answers : np.array
        True workload answers on the private data (values in [0, 1]).
    N : int
        Number of records in the private dataset (scales the EM exponent).
    domain : Domain
        Data domain; the one-hot record length D is the sum of its shape.
    query_manager : QueryManager
        Supplies weighted query workloads and answer evaluation.
    epsilon, delta : float
        Overall privacy budget; the round count T and per-round epsilon_0
        are derived from these via advanced composition.
    epsilon_split : float
        Per-round privacy split; must be > 0.
    noise_multiple : float
        Multiplier on the exponential FTPL perturbation scale.
    samples : int
        Number of synthetic records sampled per round.
    alpha : float, optional
        Relaxation parameter forwarded to ``gen_fake_data``.
    show_prgress : bool, optional
        Show a tqdm progress bar. (Misspelled name kept for caller
        compatibility.)
    """
    assert epsilon_split > 0
    assert noise_multiple > 0
    neg_real_answers = 1 - real_answers
    D = np.sum(domain.shape)            # one-hot encoded record length
    Q_size = query_manager.num_queries  # offset separating q from q_neg

    prev_queries = []   # indices of sampled positive queries
    neg_queries = []    # indices of sampled negated queries

    final_oh_fake_data = []  # stores the final data
    '''
    Calculate the total number of rounds using advance composition
    '''
    T, epsilon_0 = get_iters(epsilon, delta, epsilon_split)

    exponential_scale = np.sqrt(T) * noise_multiple
    if show_prgress: progress_bar = tqdm(total=T)
    for t in range(T):
        """
        Sample s times from FTPL
        """
        util2.blockPrint()  # workers are chatty; silence stdout
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)  # per-process sample quota
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()  # shared sink for generated one-hot rows

        query_workload, q_weights = query_manager.get_query_workload_weighted(
            prev_queries)
        neg_query_workload, n_weights = query_manager.get_query_workload_weighted(
            neg_queries)

        for __ in range(num_processes):
            # Last worker takes whatever remainder is left.
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload, q_weights,
                                    neg_query_workload, n_weights, noise,
                                    domain, alpha, temp_s))

            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()

        util2.enablePrint()
        assert len(fake_temp) > 0
        # Snapshot this round's samples and accumulate them for the final
        # output dataset.
        oh_fake_data = list(fake_temp)
        final_oh_fake_data.extend(oh_fake_data)

        assert len(oh_fake_data) == samples, \
            "len(D_hat) = {} len(fake_temp) = {}".format(
                len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(
                len(oh_fake_data[0]))
        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(oh_fake_data, domain),
                         columns=domain.attrs), domain)
        """
        Compute Exponential Mechanism distribution
        """
        fake_answers = query_manager.get_answer(fake_data)
        neg_fake_answers = 1 - fake_answers

        # Scores for positive queries followed by their negations.
        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        # np.longdouble is the portable spelling of extended precision
        # (np.float128 does not exist on every platform); precision is needed
        # so the exponentials do not all overflow/underflow.
        EM_dist_0 = np.exp(epsilon_0 * score * N / 2, dtype=np.longdouble)
        total = np.sum(EM_dist_0)  # renamed from `sum`: don't shadow builtin
        assert total > 0
        assert not np.isinf(total)
        EM_dist = EM_dist_0 / total
        assert not np.isnan(
            EM_dist).any(), "EM_dist_0 = {} EM_dist = {} sum = {}".format(
                EM_dist_0, EM_dist, total)
        assert not np.isinf(
            EM_dist).any(), "EM_dist_0 = {} EM_dist = {} sum = {}".format(
                EM_dist_0, EM_dist, total)
        """
        Sample from EM
        """
        q_t_ind = util2.sample(EM_dist)

        # Indices >= Q_size refer to the negated copy of the workload.
        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

        if show_prgress:
            progress_bar.update()
            progress_bar.set_postfix({
                'max error': f'{np.max(score):.3f}',
                'round error': f'{score[q_t_ind]:.3f}'
            })

    if show_prgress: progress_bar.close()

    final_fem_data = Dataset(
        pd.DataFrame(util2.decode_dataset(final_oh_fake_data, domain),
                     columns=domain.attrs), domain)
    return final_fem_data
Esempio n. 3
0
File: fem.py Progetto: giusevtr/fem
    # NOTE(review): fragment — the enclosing function header is outside this
    # excerpt. Reads parsed CLI `args`, loads a k-way benchmark workload, and
    # generates DP synthetic data per requested epsilon.
    print("=============================================")
    print(vars(args))

    ######################################################
    ## Get dataset
    ######################################################
    data, workloads = benchmarks.randomKway(args.dataset[0], args.workload[0],
                                            args.marginal[0])
    N = data.df.shape[0]
    delta = 1.0 / N**2  # conventional choice: delta = 1/N^2

    ######################################################
    ## Get Queries
    ######################################################
    stime = time.time()
    query_manager = QueryManager(data.domain, workloads)
    print("Number of queries = ", len(query_manager.queries))

    real_ans = query_manager.get_answer(data)

    res = []  # presumably collects per-epsilon results — TODO confirm below

    for eps in args.epsilon:
        ######################################################
        ## Generate synthetic data with eps
        ######################################################
        fem_start = time.time()
        # NOTE(review): this call is truncated in the excerpt.
        fem_data = generate(real_answers=real_ans,
                            N=N,
                            domain=data.domain,
                            query_manager=query_manager,
    # NOTE(review): fragment — enclosing function header not visible here.
    # Loads a private dataset plus a public dataset sharing the same
    # workload, for public-data-assisted synthesis.
    proj = get_proj(args.dataset)
    if args.dataset.endswith('-small'):
        if args.dataset.startswith('acs'):
            # Strip the trailing '-small' suffix (6 characters).
            args.dataset = args.dataset[:-6]
            args.dataset_pub = args.dataset_pub[:-6]

    filter_private, filter_pub = get_filters(args)

    data, workloads = randomKway(args.dataset,
                                 args.workload,
                                 args.marginal,
                                 seed=args.workload_seed,
                                 proj=proj,
                                 filter=filter_private)
    query_manager = QueryManager(data.domain, workloads)
    N = data.df.shape[0]

    # Public dataset uses the same workload seed so queries are comparable.
    data_pub, _ = randomKway(args.dataset_pub,
                             args.workload,
                             args.marginal,
                             seed=args.workload_seed,
                             proj=proj,
                             filter=filter_pub)
    N_pub = int(args.pub_frac * data_pub.df.shape[0])
    data_pub, A_init = get_pub_dataset(data_pub, args.pub_frac, args.frac_seed)

    print('workload: ', len(workloads))
    print('num queries: ', query_manager.num_queries)
    print('A:', A_init.shape)
Esempio n. 5
0
    # NOTE(review): fragment — the argparse parser is created above this
    # excerpt. Parses CLI args, builds the query manager, then runs a
    # random-search grid over FEM hyperparameters.
    parser.add_argument('marginal', type=int, nargs=1, help='queries')
    parser.add_argument('samples', type=int, nargs=1, help='hyperparameter')
    parser.add_argument('epsilon',
                        type=float,
                        nargs='+',
                        help='Privacy parameter')
    args = parser.parse_args()
    eps = args.epsilon[0]  # only the first epsilon is used here

    print("=============================================")
    print(vars(args))
    data, workloads = benchmarks.randomKway(args.dataset[0], args.workload[0],
                                            args.marginal[0])
    N = data.df.shape[0]
    stime = time.time()
    query_manager = QueryManager(data.domain, workloads)
    print("Number of queries = ", len(query_manager.queries))
    print("epsilon = ", eps, "=========>")

    #     random_search(data=data,
    #                   query_manager=query_manager,
    #                   epsilon=eps,
    #                   samples=args.samples[0],
    #                   max_iter=10,
    #                   timeout=300)

    RS_grid_search(data=data,
                   query_manager=query_manager,
                   samples=args.samples[0],
                   max_iter=50,
                   timeout=300)
Esempio n. 6
0
        os.makedirs(results_dir)

    # NOTE(review): fragment — enclosing function header not visible here.
    # Non-DP baseline: fits weights A over a support dataset against the
    # real workload answers.
    proj = get_proj(args.dataset)
    if args.dataset.endswith('-small'):
        if args.dataset.startswith('acs'):
            # Strip the trailing '-small' suffix (6 characters).
            args.dataset = args.dataset[:-6]

    filter_private, filter_pub = get_filters(args)

    data, workloads = randomKway(args.dataset,
                                 args.workload,
                                 args.marginal,
                                 seed=args.workload_seed,
                                 proj=proj,
                                 filter=filter_private)
    query_manager = QueryManager(data.domain, workloads)
    N = data.df.shape[0]

    data_support, A_init = get_support(data)

    # get answers and initial error
    real_answers = query_manager.get_answer(data)
    query_manager.setup_xy(data_support)
    fake_answers = query_manager.get_answer_weights(A_init)
    init_error = np.abs(real_answers - fake_answers).max()  # max workload error

    # NOTE(review): this call is truncated in the excerpt.
    A_avg, A_last = generate_nondp(data_support,
                                   real_answers,
                                   A_init,
                                   query_manager,
                                   early_stopping=args.early_stopping,
Esempio n. 7
0
    # NOTE(review): fragment — enclosing function header not visible here.
    # Prepares save directories and (optionally) subsamples the support set
    # to args.support_size rows before running MWEM-style synthesis.
    if args.support_size is not None:
        save_dir_xy = save_dir_xy + 'mwem/{}'.format(args.support_size)
    for d in [save_dir_query, save_dir_xy]:
        if not os.path.exists(d):
            os.makedirs(d)

    proj = get_proj(args.dataset)
    if args.dataset.endswith('-small'):
        # Strip the trailing '-small' suffix (6 characters).
        args.dataset = args.dataset[:-6]

    filter_private, filter_pub = get_filters(args)

    data, workloads = randomKway(args.dataset, args.workload, args.marginal, seed=args.workload_seed, proj=proj,
                                 filter=filter_private, args=args)

    query_manager = QueryManager(data.domain, workloads)
    N = data.df.shape[0]
    print(f'dim = {sum([p for p in data.domain.shape])}')

    data_support, A_init = get_support(data)
    if args.support_size is not None:
        # Subsample the support reproducibly and reset to uniform weights.
        df_support = data_support.df
        prng = np.random.RandomState(args.support_seed)
        idxs = prng.choice(df_support.index.values, size=args.support_size, replace=False)
        df_support = df_support.loc[idxs].reset_index(drop=True)
        data_support = Dataset(df_support, data_support.domain)
        A_init = np.ones(len(df_support))
        A_init = A_init / len(A_init)

    print('workload: ', len(workloads))
    print('num queries: ', query_manager.num_queries)
Esempio n. 8
0
    # NOTE(review): fragment — the argparse parser is created above this
    # excerpt. Runs fem_grid_search once per requested epsilon, averaged
    # over --nave runs.
    parser.add_argument('epsilon',
                        type=float,
                        nargs='+',
                        help='Privacy parameter')
    parser.add_argument('--nave', type=int, default=1, help='Number of runs')
    args = parser.parse_args()
    print(vars(args))

    # Get dataset
    data, workloads = benchmarks.randomKway(args.dataset[0], args.workload[0],
                                            args.marginal[0])
    N = data.df.shape[0]

    # Get Queries
    stime = time.time()
    query_manager = QueryManager(data.domain, workloads)
    print("Number of queries = ", len(query_manager.queries))
    print('computing real answers...', end='')
    # Cache the real answers on the manager so the search can reuse them.
    query_manager.real_answers = query_manager.get_answer(data)
    print('Done!')
    final_df = None
    for eps in args.epsilon:
        print("epsilon = ", eps, "=========>")
        # Generate synthetic data with eps
        start_time = time.time()
        df = fem_grid_search(data,
                             eps,
                             query_manager,
                             n_ave=args.nave,
                             timeout=300)
        elapsed_time = time.time() - start_time
Esempio n. 9
0
 def setup_class(self):
     # Pytest-style class fixture: builds a tiny domain, a single workload,
     # and 50 rows of dummy data for the tests in this class.
     # NOTE(review): Domain receives 4 attribute names but only 3 sizes —
     # confirm whether Domain tolerates the mismatch or this is a latent bug.
     self.domain = Domain(('A', 'B', 'C', 'D'), [3, 3, 3])
     self.workloads = [('A', 'C')]
     self.query_manager = QueryManager(self.domain, self.workloads)
     self.data = get_dummy_data2(self.domain, 50, self.query_manager)
Esempio n. 10
0
    os.makedirs(results_dir)

# NOTE(review): fragment — top-level script code; some context is above this
# excerpt. Filters the dataset to one STATE, answers the workload, then
# compares against every other state's public data.
proj = get_proj(args.dataset)
if args.dataset.endswith('-small'):
    if args.dataset.startswith('acs'):
        # Strip the trailing '-small' suffix (6 characters).
        args.dataset = args.dataset[:-6]
        args.dataset_pub = args.dataset_pub[:-6]

filter = ('STATE', args.state)
data, workloads = randomKway(args.dataset,
                             args.workload,
                             args.marginal,
                             seed=args.workload_seed,
                             proj=proj,
                             filter=filter)
query_manager = QueryManager(data.domain, workloads)
N = data.df.shape[0]

real_answers = query_manager.get_answer(data, debug=False)
query_manager.setup_query_attr()

# Enumerate all states present in the raw CSV, then free the frame.
pd_states = pd.read_csv('Datasets/{}.csv'.format(args.dataset))
states = pd_states['STATE'].unique()
del pd_states

errors = []
for state in states:
    filter = ('STATE', state)
    # NOTE(review): this call is truncated in the excerpt.
    data_pub, _ = randomKway(args.dataset,
                             args.workload,
                             args.marginal,
Esempio n. 11
0
    # NOTE(review): fragment — enclosing function header is outside this
    # excerpt. Loads the benchmark, builds queries, then generates synthetic
    # data once per epsilon.
    ## Get dataset
    ######################################################
    data, workloads = benchmarks.randomKway(args.dataset[0], args.workload[0],
                                            args.marginal[0])
    N = data.df.shape[0]

    # print("True answers: ")
    # for proj in workloads:
    #     dp = data.project(proj).datavector()
    #     print(dp[:10])
    # print("============")
    ######################################################
    ## Get Queries
    ######################################################
    stime = time.time()
    query_manager = QueryManager(data.domain, workloads)
    print("Number of queries = ", len(query_manager.queries))

    for eps in args.epsilon:
        print("epsilon = ", eps, "=========>")
        ######################################################
        ## Generate synthetic data with eps
        ######################################################
        start_time = time.time()
        # NOTE(review): this call is truncated in the excerpt.
        syndata, status = generate(data=data,
                                   query_manager=query_manager,
                                   epsilon=eps,
                                   epsilon_0=args.eps0[0],
                                   exponential_scale=args.noise[0],
                                   adaptive=args.adaptive[0],
                                   samples=args.samples[0],