def get_pub_dataset_biased(data, pub_frac, frac_seed, bias_attr, perturb):
    prng = np.random.RandomState(frac_seed)
    df_priv = data.df
    N = data.df.shape[0]

    # empirical distribution of the (binary) bias attribute, shifted by `perturb`
    attr_distr = np.bincount(df_priv[bias_attr])
    attr_distr = attr_distr / attr_distr.sum()
    orig_attr_distr = attr_distr.copy()
    attr_distr[0] += perturb
    attr_distr[1] = 1 - attr_distr[0]

    # resample pub_frac * N rows (with replacement) according to the shifted distribution
    df_pub = []
    for i in range(attr_distr.shape[0]):
        mask = df_priv[bias_attr] == i
        df_attr = df_priv[mask].reset_index(drop=True)
        size = int(pub_frac * N * attr_distr[i])
        idxs = prng.choice(df_attr.index, size=size, replace=True)
        df_pub.append(df_attr.loc[idxs])
    df_pub = pd.concat(df_pub).reset_index(drop=True)

    # collapse to distinct rows and record their empirical weights
    cols = list(df_pub.columns)
    df_pub = df_pub.reset_index().groupby(cols).count()
    df_pub.reset_index(inplace=True)

    A_init = df_pub['index'].values
    A_init = A_init / A_init.sum()
    data_pub = Dataset(df_pub, data.domain)

    return data_pub, A_init, orig_attr_distr
def privbayes_inference(domain, measurements, total):
    synthetic = pd.DataFrame()

    _, y, _, proj = measurements[0]
    y = np.maximum(y, 0)
    y /= y.sum()
    col = proj[0]
    synthetic[col] = np.random.choice(domain[col], total, True, y)

    for _, y, _, proj in measurements[1:]:
        # find the CPT
        col, dep = proj[0], proj[1:]
        print(col)
        y = np.maximum(y, 0)
        dom = domain.project(proj)
        cpt = Factor(dom, y.reshape(dom.shape))
        marg = cpt.project(dep)
        cpt /= marg
        cpt2 = np.moveaxis(cpt.project(proj).values, 0, -1)

        # sample current column
        synthetic[col] = 0
        rng = itertools.product(*[range(domain[a]) for a in dep])
        for v in rng:
            idx = (synthetic.loc[:, dep].values == np.array(v)).all(axis=1)
            p = cpt2[v].flatten()
            if p.sum() == 0:
                p = np.ones(p.size) / p.size
            n = domain[col]
            N = idx.sum()
            if N > 0:
                synthetic.loc[idx, col] = np.random.choice(n, N, True, p)

    return Dataset(synthetic, domain)
def max_sum_ve(factors, domain=None, elim=None):
    """ Run max-product variable elimination on the factors.
    Return the most likely assignment as a one-row Dataset, where the
    columns are attributes and the values are elements of the domain.
    """
    # step 0: choose an elimination order
    if domain is None:
        domain = reduce(Domain.merge, [F.domain for F in factors])
    if elim is None:
        cliques = [F.domain.attrs for F in factors]
        elim = graphical_model.greedy_order(domain, cliques, domain.attrs)

    # step 1: variable elimination
    k = len(factors)
    phi = dict(zip(range(k), factors))
    psi = {}
    for z in elim:
        phi2 = [phi.pop(i) for i in list(phi.keys()) if z in phi[i].domain]
        psi[z] = sum(phi2, Factor.ones(domain.project(z)))
        phi[k] = psi[z].max([z])
        k += 1
    value = phi[k - 1]

    # step 2: traceback-MAP
    x = {}
    for z in reversed(elim):
        x[z] = psi[z].condition(x).values.argmax()

    # step 3: convert to a Dataset object
    df = pd.DataFrame(x, index=[0])
    return Dataset(df, domain)
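# A minimal usage sketch of max_sum_ve (assumes the mbi package; the two
# log-potential factors below are illustrative). Potentials are summed, so the
# MAP assignment in this toy example is A=0, B=1.
from mbi import Domain, Factor
import numpy as np

dom = Domain(['A', 'B'], [2, 2])
fA = Factor(dom.project(['A']), np.array([0.0, 1.0]))    # mild preference for A=1
fAB = Factor(dom, np.array([[0.0, 3.0], [1.0, 0.0]]))    # strong preference for (A=0, B=1)
best = max_sum_ve([fA, fAB], domain=dom)
print(best.df)  # one-row Dataset holding the MAP assignment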
def DualQuery(data, workload, eps=1.0, delta=0.001, seed=0):
    prng = np.random.RandomState(seed)
    total = data.df.shape[0]
    domain = data.domain
    answers = answer_workload(workload, data) / total

    nu = 2.0
    s = 50
    # T = int(0.5 * (np.sqrt(4 * eps * total + s * nu) / np.sqrt(s * nu) + 1))
    # choose the largest number of rounds T whose privacy cost stays below eps
    T = 2
    while 2 * nu * (T - 1) / total * (np.sqrt(2 * s * (T - 1) * np.log(1.0 / delta)
            + s * (T - 1) * np.exp(2 * nu * (T - 1) / total) - 1)) < eps:
        T = T + 1
    T = T - 1

    Qsize = sum(W.shape[0] for _, W in workload)
    Xsize = data.domain.size()

    Q = np.ones(Qsize) / Qsize

    cache = []
    # lookup = [Factor(domain.project(cl), q) for cl, W in workload for q in W]
    lookup = [(cl, W, i) for cl, W in workload for i in range(W.shape[0])]
    results = []
    for i in range(T):
        idx = prng.choice(Qsize, s, True, Q)
        # queries = [lookup[i] for i in idx]
        queries = []
        for j in idx:
            cl, W, e = lookup[j]
            dom = domain.project(cl)
            n = W.shape[0]
            z = np.zeros(n)
            z[e] = 1.0
            q = W.T.dot(z)
            queries.append(Factor(dom, -q))
        best = max_sum_ve(queries, data.domain)
        curr = answer_workload(workload, best)
        Q *= np.exp(-nu * (answers - curr))
        Q /= Q.sum()
        cache.append((idx, curr))
        results.append(best.df)

    synthetic = Dataset(pd.concat(results), data.domain)

    print('Iterations', T)
    print('Privacy level', nu * T * (T - 1) * s / total)
    delta = 1e-3
    eps = 2 * nu * (T - 1) / total * (np.sqrt(2 * s * (T - 1) * np.log(1.0 / delta)
            + s * (T - 1) * np.exp(2 * nu * (T - 1) / total) - 1))
    print('Approx privacy level', eps, delta)

    return synthetic, cache
def randomKway(name, number, marginal, proj=None, seed=0, filter=None, root_path='./', args=None):
    check_size = name in ['adult_orig', 'loans']

    path = os.path.join(root_path, "Datasets/{}.csv".format(name))
    df = pd.read_csv(path)
    domain = os.path.join(root_path, "Datasets/{}-domain.json".format(name))
    config = json.load(open(domain))
    domain = Domain(config.keys(), config.values())

    if name == 'adult':
        if args.adult_seed is not None:
            prng = np.random.RandomState(args.adult_seed)
            mask = prng.binomial(1, 0.9, size=len(df))
            df.loc[:, '_split'] = mask
        else:
            df.loc[:, '_split'] = 1

    if filter is not None:
        col, val = filter
        df = df[df[col] == val].reset_index(drop=True)
        del df[col]

    domain_max = max(domain.config.values())
    dtype = get_min_dtype(domain_max)
    df = df.astype(dtype)

    data = Dataset(df, domain)
    if proj is not None:
        data = data.project(proj)
    return data, randomKwayData(data, number, marginal, seed, check_size=check_size)
def get_support(data):
    df_support = []
    for val in list(data.domain.config.values()):
        df_support.append(np.arange(val))
    df_support = list(itertools.product(*df_support))
    df_support = np.array(df_support)
    df_support = pd.DataFrame(df_support, columns=data.df.columns)
    data_support = Dataset(df_support, data.domain)

    A_init = np.ones(len(df_support))
    A_init /= len(A_init)

    return data_support, A_init
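# A minimal usage sketch of get_support (assumes the mbi package): the support
# dataset enumerates every cell of the domain, and A_init is the uniform
# distribution over those cells.
from mbi import Dataset, Domain
import numpy as np

domain = Domain(['A', 'B'], [2, 3])
data = Dataset.synthetic(domain, 100)
data_support, A_init = get_support(data)
assert data_support.df.shape[0] == 2 * 3
assert np.isclose(A_init.sum(), 1.0)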
def load_data(self, path=None):
    """ load the data and discretize the integer/float attributes """
    if path is None:
        path = self.dataset
    df = pd.read_csv(path)
    self.column_order = df.columns

    for col in self.domain_info:
        vals = self.domain_info[col]
        mapping = dict(zip(vals, range(len(vals))))
        df[col] = df[col].map(mapping)

    # bucket INCWAGE: value // 100 below 5000, code 50 for 5000-999997, code 51 for 999998
    mapping = {k: k // 100 for k in range(5000)}
    mapping[999998] = 51
    mapping.update({i: 50 for i in range(5000, 999998)})
    df['INCWAGE_A'] = df['INCWAGE'].map(mapping)

    # encode which of {100, 20, 50, 25, 10, 5, 2} divides INCWAGE (values above 5000 map to 0)
    mod_mapping = {k: 0 for k in range(5000, 999999)}
    for i in range(5001):
        if i % 100 == 0:
            mod_mapping[i] = 0
        elif i % 20 == 0:
            mod_mapping[i] = 1
        elif i % 50 == 0:
            mod_mapping[i] = 2
        elif i % 25 == 0:
            mod_mapping[i] = 3
        elif i % 10 == 0:
            mod_mapping[i] = 4
        elif i % 5 == 0:
            mod_mapping[i] = 5
        elif i % 2 == 0:
            mod_mapping[i] = 6
        else:
            mod_mapping[i] = 7
    df['INCWAGE_B'] = df['INCWAGE'].map(mod_mapping)

    # bucket VALUEH: value // 5 up to 25000, code 5000 above, sentinel codes 5001/5002
    mapping = {}
    for i in range(9999998):
        if i <= 25000:
            mapping[i] = i // 5
        else:
            mapping[i] = 5000
    mapping[9999998] = 5001
    mapping[9999999] = 5002
    df['VALUEH'] = df['VALUEH'].map(mapping)

    return Dataset(df, self.domain)
def get_A_init(data, df):
    cols = list(df.columns)
    df = df.groupby(cols).size().reset_index(name='Count')

    A_init = df['Count'].values
    A_init = A_init / A_init.sum()

    del df['Count']
    data_pub = Dataset(df, data.domain)

    # A_init = df.groupby(cols, sort=False).size().values
    # A_init = A_init / A_init.sum()
    # df = df.drop_duplicates()
    # data_pub = Dataset(df, data.domain)

    return data_pub, A_init
def synthetic_data(self, rows=None):
    """ Generate synthetic tabular data from the distribution """
    total = int(self.total) if rows is None else rows
    cols = self.domain.attrs
    data = np.zeros((total, len(cols)), dtype=int)
    df = pd.DataFrame(data, columns=cols)
    cliques = [set(cl) for cl in self.cliques]

    def synthetic_col(counts, total):
        counts *= total / counts.sum()
        frac, integ = np.modf(counts)
        integ = integ.astype(int)
        extra = total - integ.sum()
        # if extra > 0:
        #     o = np.argsort(frac)
        #     integ[o[-extra:]] += 1
        if extra > 0:
            idx = np.random.choice(counts.size, extra, False, frac / frac.sum())
            integ[idx] += 1
        vals = np.repeat(np.arange(counts.size), integ)
        np.random.shuffle(vals)
        return vals

    order = self.elimination_order[::-1]
    col = order[0]
    marg = self.project([col]).datavector(flatten=False)
    df.loc[:, col] = synthetic_col(marg, total)
    used = {col}

    for col in order[1:]:
        relevant = [cl for cl in cliques if col in cl]
        relevant = used.intersection(set.union(*relevant))
        proj = tuple(relevant)
        used.add(col)
        marg = self.project(proj + (col,)).datavector(flatten=False)

        def foo(group):
            idx = group.name
            vals = synthetic_col(marg[idx], group.shape[0])
            group[col] = vals
            return group

        if len(proj) >= 1:
            df = df.groupby(list(proj)).apply(foo)
        else:
            df[col] = synthetic_col(marg, df.shape[0])

    return Dataset(df, self.domain)
def reverse_data(data, supports):
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        mx = support.sum()
        newdom[col] = int(support.size)
        idx, extra = np.where(support)[0], np.where(~support)[0]
        mask = df[col] == mx
        if extra.size == 0:
            pass
        else:
            df.loc[mask, col] = np.random.choice(extra, mask.sum())
        df.loc[~mask, col] = idx[df.loc[~mask, col]]
    newdom = Domain.fromdict(newdom)
    return Dataset(df, newdom)
def get_dummy_data(domain, data_size, query_manager=None):
    dis = {}
    for attr, n in zip(domain.attrs, domain.shape):
        random_dist = np.random.exponential(10, n)
        random_dist = random_dist / np.sum(random_dist)
        dis[attr] = random_dist

    arr = [np.random.choice(n, data_size, p=dis[attr]) for attr, n in zip(domain.attrs, domain.shape)]
    values = np.array(arr).T
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)

    if query_manager is not None:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plt.hist(ans)
        plt.show()
    return data
def transform_data(data, supports):
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        size = support.sum()
        newdom[col] = int(size)
        if size < support.size:
            newdom[col] += 1
        mapping = {}
        idx = 0
        for i in range(support.size):
            mapping[i] = size
            if support[i]:
                mapping[i] = idx
                idx += 1
        assert idx == size
        df[col] = df[col].map(mapping)
    newdom = Domain.fromdict(newdom)
    return Dataset(df, newdom)
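# A minimal round-trip sketch for transform_data / reverse_data (assumes the
# mbi package; the boolean support below is illustrative, e.g. derived
# elsewhere from noisy per-value counts).
from mbi import Dataset, Domain
import numpy as np

domain = Domain(['A'], [5])
data = Dataset.synthetic(domain, 20)
supports = {'A': np.array([True, True, False, True, False])}

compressed = transform_data(data, supports)    # domain shrinks to 3 kept values + 1 catch-all code
restored = reverse_data(compressed, supports)  # catch-all rows map back to random unsupported values
assert restored.domain.size('A') == 5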
def get_pub_dataset_corrupt(data, pub_frac, frac_seed, perturb, perturb_seed, asymmetric=False):
    prng_frac = np.random.RandomState(frac_seed)
    prng_perturb = np.random.RandomState(perturb_seed)

    df_pub = data.df.copy()
    pub_data_size = int(pub_frac * df_pub.shape[0])
    idxs = prng_frac.choice(df_pub.index, size=pub_data_size, replace=False)
    df_pub = df_pub.loc[idxs].reset_index(drop=True)

    mask = prng_perturb.binomial(1, p=perturb, size=df_pub.shape).astype(bool)
    domain = data.domain
    for i, attr in enumerate(df_pub.columns):
        mask_attr = mask[:, i]
        if asymmetric:
            perturbation = 1
        else:
            perturbation = prng_perturb.choice(np.arange(1, domain[attr]), size=mask_attr.sum(), replace=True)
        df_pub.loc[mask_attr, attr] += perturbation
        df_pub.loc[mask_attr, attr] %= domain[attr]

    cols = list(df_pub.columns)
    df_pub = df_pub.groupby(cols).size().reset_index(name='Count')
    A_init = df_pub['Count'].values
    A_init = A_init / A_init.sum()
    data_pub = Dataset(df_pub, data.domain)

    return data_pub, A_init
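# A minimal usage sketch of get_pub_dataset_corrupt (illustrative parameters;
# `data` is assumed to be an mbi Dataset, e.g. one returned by randomKway):
# keep a 50% "public" subsample and re-randomize each cell with probability 0.1.
data_pub, A_init = get_pub_dataset_corrupt(data, pub_frac=0.5, frac_seed=0,
                                           perturb=0.1, perturb_seed=1)
print(data_pub.df.shape[0], A_init.sum())  # A_init is a distribution over the distinct public rows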
def adult_benchmark():
    data = Dataset.load('../data/adult.csv', '../data/adult-domain.json')

    projections = [('occupation', 'race', 'capital-loss'),
                   ('occupation', 'sex', 'native-country'),
                   ('marital-status', 'relationship', 'income>50K'),
                   ('age', 'education-num', 'sex'),
                   ('workclass', 'education-num', 'occupation'),
                   ('marital-status', 'occupation', 'income>50K'),
                   ('race', 'native-country', 'income>50K'),
                   ('occupation', 'capital-gain', 'income>50K'),
                   ('marital-status', 'hours-per-week', 'income>50K'),
                   ('workclass', 'race', 'capital-gain'),
                   ('marital-status', 'relationship', 'capital-gain'),
                   ('workclass', 'education-num', 'capital-gain'),
                   ('education-num', 'relationship', 'race'),
                   ('fnlwgt', 'hours-per-week', 'income>50K'),
                   ('workclass', 'sex', 'native-country')]

    lookup = {}
    for attr in data.domain:
        n = data.domain.size(attr)
        lookup[attr] = workload.Identity(n)

    lookup['age'] = workload.Prefix(85)
    lookup['fnlwgt'] = workload.Prefix(100)
    lookup['capital-gain'] = workload.Prefix(100)
    lookup['capital-loss'] = workload.Prefix(100)
    lookup['hours-per-week'] = workload.Prefix(99)

    workloads = []
    for proj in projections:
        W = workload.Kronecker([lookup[a] for a in proj])
        workloads.append((proj, W))

    return data, workloads
def get_dummy_data2(domain, data_size, query_manager, display=False):
    num_attr = len(domain.attrs)
    bag = {}
    for i in range(len(query_manager.workloads)):
        if len(bag) >= num_attr // 2:
            break
        for attr in query_manager.workloads[i]:
            id = query_manager.att_id[attr]
            if id not in bag:
                attr_size = domain.shape[id]
                bag[id] = np.random.randint(0, attr_size)

    arr = []
    for _ in range(data_size):
        arr.append(get_dummy_row(domain, bag))
    values = np.array(arr)
    df = pd.DataFrame(values, columns=domain.attrs)
    data = Dataset(df, domain)

    if display:
        ans = query_manager.get_answer(data)
        print("max answer: ", np.max(ans))
        plot_bins(ans, title='Dummy')
    return data
def generate(data, query_manager, epsilon, epsilon_0, exponential_scale, adaptive, samples,
             alpha=0, timeout=None, show_prgress=True):
    domain = data.domain
    D = np.sum(domain.shape)
    N = data.df.shape[0]
    Q_size = query_manager.num_queries
    delta = 1.0 / N**2
    beta = 0.05  # fail probability

    prev_queries = []
    neg_queries = []
    rho_comp = 0.0000

    q1 = util2.sample(np.ones(Q_size) / Q_size)
    q2 = util2.sample(np.ones(Q_size) / Q_size)
    prev_queries.append(q1)  # sample a query from the uniform distribution
    neg_queries.append(q2)   # sample a query from the uniform distribution

    real_answers = query_manager.get_answer(data, debug=False)
    neg_real_answers = 1 - real_answers

    final_syn_data = []
    fem_start_time = time.time()
    temp = []

    # T = util.get_rounds(epsilon, epsilon_0, delta)
    T = util2.get_rounds_zCDP(epsilon, epsilon_0, adaptive, delta)

    if show_prgress:
        progress_bar = tqdm(total=T)
    status = 'OK'
    for t in range(T):
        eps_t = epsilon_0 + adaptive * t
        if show_prgress:
            progress_bar.update()

        """
        End early after timeout seconds
        """
        if (timeout is not None) and time.time() - fem_start_time > timeout:
            status = 'Timeout'
            break
        if (timeout is not None) and t >= 1 and (time.time() - fem_start_time) * T / t > timeout:
            status = 'Ending Early ({:.2f}s) '.format((time.time() - fem_start_time) * T / t)
            break

        """
        Sample s times from FTPL
        """
        util2.blockPrint()
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()

        query_workload = query_manager.get_query_workload(prev_queries)
        neg_query_workload = query_manager.get_query_workload(neg_queries)

        for __ in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload, neg_query_workload,
                                    noise, domain, alpha, temp_s))
            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()
        util2.enablePrint()

        oh_fake_data = []
        assert len(fake_temp) > 0
        for x in fake_temp:
            oh_fake_data.append(x)
            temp.append(x)
            # if current_eps >= epsilon / 2:  # this trick halves the final error
            # if t >= T / 2:  # this trick halves the final error
            final_syn_data.append(x)

        assert len(oh_fake_data) == samples, \
            "len(D_hat) = {} len(fake_data_ = {}".format(len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(len(oh_fake_data[0]))
        assert not final_syn_data or len(final_syn_data[0]) == D, \
            "D_hat dim = {}".format(len(oh_fake_data[0]))

        fake_data = Dataset(pd.DataFrame(util2.decode_dataset(oh_fake_data, domain),
                                         columns=domain.attrs), domain)

        """
        Compute Exponential Mechanism distribution
        """
        fake_answers = query_manager.get_answer(fake_data, debug=False)
        neg_fake_answers = 1 - fake_answers
        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        EM_dist_0 = np.exp(eps_t * score * N / 2, dtype=np.float128)
        sum = np.sum(EM_dist_0)
        assert sum > 0 and not np.isinf(sum)
        EM_dist = EM_dist_0 / sum
        assert not np.isnan(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, sum)
        assert not np.isinf(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, sum)

        """
        Sample from EM
        """
        q_t_ind = util2.sample(EM_dist)
        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

    if len(final_syn_data) == 0:
        status = status + '---syn data.'
        fake_data = Dataset.synthetic(domain, 100)
    else:
        if status == 'OK':
            # return the top half of the accumulated samples
            final_syn_data = np.array(final_syn_data)
            final_syn_data = final_syn_data[T // 2:, :]
        fake_data = Dataset(pd.DataFrame(util2.decode_dataset(final_syn_data, domain),
                                         columns=domain.attrs), domain)

    if show_prgress:
        progress_bar.close()
    return fake_data, status
                    help='bounded or unbounded privacy definition')
parser.add_argument('--frequency', type=int, help='logging frequency')
parser.add_argument('--seed', type=int, help='random seed')
parser.add_argument('--save', type=str, help='path to save results')
parser.add_argument('--load', type=str, help='path to load results from (skips experiment)')
parser.add_argument('--plot', type=str, help='path to save plot')

parser.set_defaults(**default_params())
args = parser.parse_args()

if args.load:
    results = pickle.load(open(args.load, 'rb'))
else:
    data = Dataset.load('../data/adult.csv', '../data/adult-domain.json')
    projections = [['race', 'capital-loss', 'income>50K'],
                   ['marital-status', 'capital-gain', 'income>50K'],
                   ['race', 'native-country', 'income>50K'],
                   ['workclass', 'sex', 'hours-per-week'],
                   ['fnlwgt', 'marital-status', 'relationship'],
                   ['workclass', 'education-num', 'occupation'],
                   ['age', 'relationship', 'sex'],
                   ['occupation', 'sex', 'hours-per-week'],
                   ['occupation', 'relationship', 'income>50K']]

    measurements = []
    for p in projections:
        Q = sparse.eye(data.domain.size(p))
        measurements.append((p, Q))
from hdmm.templates import DefaultKron, Marginals, DefaultUnionKron
from hdmm import workload
from mbi import FactoredInference, Domain, Dataset
import numpy as np
from IPython import embed

# set up domain and workload
attributes = ['A', 'B', 'C']  # should be the real column names; placeholder attributes for now
sizes = [32, 32, 32]
dom = Domain(attributes, sizes)
# W = workload.Prefix2D(32)
W = workload.DimKMarginals(sizes, 1)
data = Dataset.synthetic(dom, 1000)

# optimize strategy using HDMM
# template = DefaultKron(sizes)
# template = Marginals(sizes)
template = DefaultUnionKron(sizes, 3)
template.optimize(W)
A = template.strategy()

def take_measurements(A, data):
    """ Efficiently take measurements from HDMM strategy and convert to a PGM-compatible form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        proj = [
def randomKway(name, number, marginal, seed=0):
    path = "Datasets/{}.csv".format(name)
    domain = "Datasets/{}-domain.json".format(name)
    data = Dataset.load(path, domain)
    return data, randomKwayData(data, number, marginal, seed)
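# A minimal usage sketch (hypothetical dataset name 'adult'; assumes
# Datasets/adult.csv and Datasets/adult-domain.json exist, and that
# randomKwayData returns `number` randomly chosen `marginal`-way attribute sets):
data, kways = randomKway('adult', number=64, marginal=3)
print(data.df.shape, len(kways))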
from mbi import Dataset, FactoredInference, Domain
import numpy as np

# discrete domain with attributes A, B, C of sizes 2, 3, and 4
domain = Domain(['A', 'B', 'C'], [2, 3, 4])

# synthetic dataset with 1000 rows
data = Dataset.synthetic(domain, 1000)

# project data onto subsets of cols, and vectorize
ab = data.project(['A', 'B']).datavector()
bc = data.project(['B', 'C']).datavector()

# add noise to preserve differential privacy
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon
np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)
measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')
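# A short follow-up sketch: the GraphicalModel returned by estimate() can
# answer new marginal queries and generate synthetic microdata, as in the
# Private-PGM examples.
ab_est = model.project(('A', 'B')).datavector()  # estimated A,B marginal
synth = model.synthetic_data()                   # synthetic Dataset drawn from the model
print(np.abs(ab - ab_est).sum())                 # L1 error on a measured marginal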
pb_path += str(i)
pb_path += ".csv"
print(pb_path)
syn_data_privbayes = Dataset.load(pb_path, domain)

dq_path = dualquerydata
dq_path += str(i)
dq_path += ".csv"
print(dq_path)
syn_data_dualquery = Dataset.load(dq_path, domain)
'''
gm_path = gmdata
gm_path += str(i + 1)
gm_path += " .csv"
print(gm_path)
syn_data_r = Dataset.load(gm_path, domain)

# err_pb = []
# err_dq = []
err_r = []
print("ss")
for p, W in workload:
    true = W.dot(data.project(p).datavector())
    # print(data.project(p).datavector())
    # pb = W.dot(syn_data_privbayes.project(p).datavector())
    # print(syn_data_privbayes.project(p).datavector())
    # dq_data = syn_data_dualquery.project(p).datavector()
    # dq_data *= total / dq_data.sum()
    # dq = W.dot(dq_data)
    # print(syn_data_dualquery.project(p).datavector())
    r = W.dot(syn_data_r.project(p).datavector())
def generate(data, query_manager, epsilon, epsilon_0, exponential_scale, samples,
             alpha=0, show_prgress=True):
    domain = data.domain
    D = np.sum(domain.shape)
    N = data.df.shape[0]
    Q_size = query_manager.num_queries
    delta = 1.0 / N**2
    beta = 0.05  # fail probability

    prev_queries = []
    neg_queries = []
    rho_comp = 0.0000

    q1 = util.sample(np.ones(Q_size) / Q_size)
    q2 = util.sample(np.ones(Q_size) / Q_size)
    prev_queries.append(q1)  # sample a query from the uniform distribution
    neg_queries.append(q2)   # sample a query from the uniform distribution

    real_answers = query_manager.get_answer(data, debug=False)
    neg_real_answers = 1 - real_answers

    final_syn_data = []
    t = -1
    start_time = time.time()
    temp = []
    if show_prgress:
        # progress = tqdm(total=0.5 * epsilon ** 2)
        progress = tqdm(total=epsilon)
        last_eps = 0

    while True:
        """
        End early after 10 minutes
        """
        if time.time() - start_time > 600:
            break

        t += 1
        rho = 0.5 * epsilon_0**2
        rho_comp += rho  # EM privacy
        current_eps = rho_comp + 2 * np.sqrt(rho_comp * np.log(1 / delta))

        if current_eps > epsilon:
            break

        if show_prgress:
            progress.update(current_eps - last_eps)
            last_eps = current_eps

        """
        Sample s times from FTPL
        """
        util.blockPrint()
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()

        query_workload = query_manager.get_query_workload(prev_queries)
        neg_query_workload = query_manager.get_query_workload(neg_queries)

        for i in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload, neg_query_workload,
                                    noise, domain, alpha, temp_s))
            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()
        util.enablePrint()

        oh_fake_data = []
        assert len(fake_temp) > 0
        for x in fake_temp:
            oh_fake_data.append(x)
            temp.append(x)
            if current_eps >= epsilon / 2:  # this trick halves the final error
                final_syn_data.append(x)

        assert len(oh_fake_data) == samples, \
            "len(D_hat) = {} len(fake_data_ = {}".format(len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(len(oh_fake_data[0]))
        assert not final_syn_data or len(final_syn_data[0]) == D, \
            "D_hat dim = {}".format(len(oh_fake_data[0]))

        fake_data = Dataset(pd.DataFrame(util.decode_dataset(oh_fake_data, domain),
                                         columns=domain.attrs), domain)

        """
        Compute Exponential Mechanism distribution
        """
        fake_answers = query_manager.get_answer(fake_data, debug=False)
        neg_fake_answers = 1 - fake_answers
        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        EM_dist_0 = np.exp(epsilon_0 * score * N / 2, dtype=np.float128)
        sum = np.sum(EM_dist_0)
        assert sum > 0 and not np.isinf(sum)
        EM_dist = EM_dist_0 / sum
        assert not np.isnan(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, sum)
        assert not np.isinf(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, sum)

        """
        Sample from EM
        """
        q_t_ind = util.sample(EM_dist)

        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

    if len(final_syn_data) == 0:
        final_syn_data = temp

    fake_data = Dataset(pd.DataFrame(util.decode_dataset(final_syn_data, domain),
                                     columns=domain.attrs), domain)
    return fake_data