data_pub, _ = randomKway(args.dataset_pub,
                             args.workload,
                             args.marginal,
                             seed=args.workload_seed,
                             proj=proj,
                             filter=filter_pub)
    N_pub = int(args.pub_frac * data_pub.df.shape[0])
    data_pub, A_init = get_pub_dataset(data_pub, args.pub_frac, args.frac_seed)

    print('workload: ', len(workloads))
    print('num queries: ', query_manager.num_queries)
    print('A:', A_init.shape)

    real_answers = query_manager.get_answer(data)
    query_manager.setup_query_attr(save_dir=save_dir_query)
    query_manager.setup_xy(data_pub, save_dir=save_dir_xy)
    fake_answers = query_manager.get_answer_weights(A_init)
    init_error = np.abs(real_answers - fake_answers).max()

    result_cols = {
        'marginal': args.marginal,
        'num_workloads': len(workloads),
        'workload_seed': args.workload_seed,
        'num_queries': query_manager.num_queries,
        'dataset_pub': args.dataset_pub,
        'state_pub': args.state_pub,
        'pub_frac': args.pub_frac,
        'frac_seed': args.frac_seed,
        'priv_size': N,
        'pub_size': N_pub,
    }
Esempio n. 2
0
        df_support = data_support.df
        prng = np.random.RandomState(args.support_seed)
        idxs = prng.choice(df_support.index.values, size=args.support_size, replace=False)
        df_support = df_support.loc[idxs].reset_index(drop=True)
        data_support = Dataset(df_support, data_support.domain)
        A_init = np.ones(len(df_support))
        A_init = A_init / len(A_init)

    print('workload: ', len(workloads))
    print('num queries: ', query_manager.num_queries)
    print('A:', A_init.shape)

    # get answers and initial error
    real_answers = query_manager.get_answer(data, concat=False)
    query_manager.setup_query_attr(save_dir=save_dir_query)
    query_manager.setup_xy(data_support, save_dir=save_dir_xy)
    fake_answers = query_manager.get_answer_weights(A_init, concat=False)
    init_errors = util.get_errors(real_answers, fake_answers)

    delta = 1.0 / N ** 2
    rho = cdp_rho(args.epsilon, delta)
    eps0 = (2 * rho) ** 0.5 / (2 * args.T) ** 0.5

    result_cols = {'adult_seed': [args.adult_seed],
                   'marginal': [args.marginal],
                   'num_workloads': [len(workloads)],
                   'workload_seed': [args.workload_seed],
                   'num_queries': [query_manager.num_queries],
                   'priv_size': [N],
                   'support_size': [args.support_size],
                   'support_seed': [args.support_seed]
Esempio n. 3
0
    filter_private, filter_pub = get_filters(args)

    data, workloads = randomKway(args.dataset,
                                 args.workload,
                                 args.marginal,
                                 seed=args.workload_seed,
                                 proj=proj,
                                 filter=filter_private)
    query_manager = QueryManager(data.domain, workloads)
    N = data.df.shape[0]

    data_support, A_init = get_support(data)

    # get answers and initial error
    real_answers = query_manager.get_answer(data)
    query_manager.setup_xy(data_support)
    fake_answers = query_manager.get_answer_weights(A_init)
    init_error = np.abs(real_answers - fake_answers).max()

    A_avg, A_last = generate_nondp(data_support,
                                   real_answers,
                                   A_init,
                                   query_manager,
                                   early_stopping=args.early_stopping,
                                   return_last=True)

    fake_answers = query_manager.get_answer_weights(A_avg)
    max_error_avg = np.abs(real_answers - fake_answers).max()

    fake_answers = query_manager.get_answer_weights(A_last)
    max_error_last = np.abs(real_answers - fake_answers).max()
Esempio n. 4
0
# Per-state error sweep: for every distinct STATE value in the dataset, build a
# dataset restricted to that state, weight it with get_pub_dataset, and measure
# how far its query answers are from `real_answers` (computed elsewhere in the
# file). The results are collected into a DataFrame sorted by max error.

# Only the STATE column is needed to enumerate the states, so read just that
# column instead of loading (and then discarding) the whole CSV.
states = pd.read_csv('Datasets/{}.csv'.format(args.dataset),
                     usecols=['STATE'])['STATE'].unique()

errors = []
for state in states:
    # NOTE(review): `filter` shadows the builtin. The name is kept because it
    # is a module-level binding that code outside this view may still read.
    filter = ('STATE', state)
    data_pub, _ = randomKway(args.dataset,
                             args.workload,
                             args.marginal,
                             seed=args.workload_seed,
                             proj=proj,
                             filter=filter)
    # Presumably (fraction=1.0, seed=0), i.e. keep every row of this state's
    # data -- TODO confirm against get_pub_dataset's signature.
    data_pub, A_init = get_pub_dataset(data_pub, 1.0, 0)
    query_manager.setup_xy(data_pub)
    fake_answers = query_manager.get_answer_weights(A_init)

    # Element-wise absolute gap between the reference answers and this
    # state's answers, summarised by its maximum and its mean.
    error = np.abs(real_answers - fake_answers)
    max_error = error.max()
    avg_error = error.mean()
    errors.append({
        'state': state,
        'max_error': max_error,
        'avg_error': avg_error
    })
    print(state, max_error, avg_error)

# Naming the columns explicitly keeps the frame well-formed (and sortable)
# even when no states were found and `errors` is empty.
df = pd.DataFrame(errors, columns=['state', 'max_error', 'avg_error'])
df = df.sort_values('max_error').reset_index(drop=True)
df['workload'] = args.workload
df['marginal'] = args.marginal