# Example pipeline: take noisy 2-way marginal measurements of a synthetic
# dataset and fit a model to them with mbi's FactoredInference.
from mbi import Dataset, FactoredInference, Domain
import numpy as np

# discrete domain with attributes A, B, C and corresponding sizes 2 x 3 x 4
domain = Domain(['A','B','C'], [2, 3, 4])
# synthetic dataset with 1000 rows drawn from this domain
data = Dataset.synthetic(domain, 1000)

# project data onto subsets of cols, and vectorize: flattened contingency
# tables (count vectors) of the (A,B) and (B,C) marginals
ab = data.project(['A','B']).datavector()
bc = data.project(['B','C']).datavector()

# add Laplace noise to preserve differential privacy
# NOTE(review): scale sqrt(2)/epsilon presumably splits the budget across
# the two measurements — confirm against the intended privacy accounting
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon
np.random.seed(0)  # fixed seed so the demo is reproducible
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in the (matrix, noisy answers, noise scale,
# projection) form needed by inference; identity matrices indicate each
# marginal cell was measured directly
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)
measurements = [(Iab, yab, sigma, ['A', 'B']), (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution ('MD' presumably selects mbi's
# mirror-descent optimizer — verify against the mbi documentation)
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')
# Example: optimize an HDMM measurement strategy for a marginals workload.
# NOTE(review): this chunk is truncated — take_measurements() is cut off
# mid-statement at the end; its remainder is outside this view.
from hdmm.templates import DefaultKron, Marginals, DefaultUnionKron
from hdmm import workload
from mbi import FactoredInference, Domain, Dataset
import numpy as np
from IPython import embed

# set up domain and workload
attributes = [ 'A', 'B', 'C' ] # should be the names of the columns, for now just using 0 and 1
sizes = [32, 32, 32]
dom = Domain(attributes, sizes)
#W = workload.Prefix2D(32)
# presumably the workload of all 1-way marginals — confirm against hdmm docs
W = workload.DimKMarginals(sizes, 1)
data = Dataset.synthetic(dom, 1000)

# optimize strategy using HDMM (union-of-Kronecker template with 3 terms)
#template = DefaultKron(sizes)
#template = Marginals(sizes)
template = DefaultUnionKron(sizes, 3)
template.optimize(W)
A = template.strategy()

def take_measurements(A, data):
    """ Efficiently take measurements from HDMM strategy and convert to a PGM-compatible form """
    # canonicalize the strategy so it can be iterated term by term
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        proj = [
# NOTE(review): chunk ends mid-statement here; the rest of the function
# body is not visible in this view.
def generate(data, query_manager, epsilon, epsilon_0, exponential_scale, adaptive, samples, alpha=0, timeout=None, show_prgress=True):
    """FEM-style synthetic data generator under zCDP.

    Runs T rounds (T from util2.get_rounds_zCDP). Each round fans out the
    generation of `samples` candidate fake records to worker processes
    (gen_fake_data, perturbed by exponential noise), then uses the
    exponential mechanism to select the query — positive or negated — that
    the fake data answers worst, adding it to the running query sets.

    Args:
        data: project Dataset holding the private records (`data.df`).
        query_manager: supplies `num_queries`, `get_answer`, and
            `get_query_workload`.
        epsilon: total privacy budget.
        epsilon_0: per-round budget; per-round EM budget grows as
            eps_t = epsilon_0 + adaptive * t.
        exponential_scale: scale of the exponential noise given to workers.
        adaptive: per-round budget increment (0 disables adaptivity).
        samples: number of fake records generated per round.
        alpha: passed through to gen_fake_data.
        timeout: wall-clock budget in seconds; the loop stops early when it
            is exceeded or projected (linearly) to be exceeded.
        show_prgress: show a tqdm progress bar (misspelled name kept for
            backward compatibility with existing callers).

    Returns:
        (fake_data, status): a Dataset of synthetic records and a status
        string ('OK', 'Timeout', 'Ending Early (...)s', or a marker that no
        synthetic data was produced).
    """
    domain = data.domain
    D = np.sum(domain.shape)          # total encoded dimension of a record
    N = data.df.shape[0]              # number of private records
    Q_size = query_manager.num_queries
    delta = 1.0 / N**2                # delta = 1/N^2 for the zCDP conversion

    # Seed both query sets with one uniformly random query each.
    prev_queries = [util2.sample(np.ones(Q_size) / Q_size)]
    neg_queries = [util2.sample(np.ones(Q_size) / Q_size)]

    real_answers = query_manager.get_answer(data, debug=False)
    neg_real_answers = 1 - real_answers

    final_syn_data = []
    fem_start_time = time.time()

    T = util2.get_rounds_zCDP(epsilon, epsilon_0, adaptive, delta)
    if show_prgress:
        progress_bar = tqdm(total=T)
    status = 'OK'
    for t in range(T):
        eps_t = epsilon_0 + adaptive * t   # this round's EM budget
        if show_prgress:
            progress_bar.update()

        # End early once `timeout` seconds have elapsed, or once the
        # linearly extrapolated total runtime exceeds the timeout.
        if timeout is not None:
            elapsed = time.time() - fem_start_time
            if elapsed > timeout:
                status = 'Timeout'
                break
            if t >= 1 and elapsed * T / t > timeout:
                status = 'Ending Early ({:.2f}s) '.format(elapsed * T / t)
                break

        # Sample `samples` fake records from FTPL, fanned out over workers.
        util2.blockPrint()                        # silence worker output
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)   # per-worker batch size
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()                # shared sink for workers
        query_workload = query_manager.get_query_workload(prev_queries)
        neg_query_workload = query_manager.get_query_workload(neg_queries)
        for _ in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload,
                                    neg_query_workload, noise, domain,
                                    alpha, temp_s))
            proc.start()
            processes.append(proc)
        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()
        util2.enablePrint()

        assert len(fake_temp) > 0
        oh_fake_data = []
        for x in fake_temp:
            oh_fake_data.append(x)
            # Keep every round's samples; the top half is selected after
            # the loop (this trick halves the final error).
            final_syn_data.append(x)

        assert len(oh_fake_data) == samples, \
            "len(D_hat) = {} len(fake_temp) = {}".format(len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(len(oh_fake_data[0]))
        assert not final_syn_data or len(final_syn_data[0]) == D, \
            "D_hat dim = {}".format(len(oh_fake_data[0]))

        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(oh_fake_data, domain),
                         columns=domain.attrs), domain)

        # Exponential-mechanism distribution over (positive, negated)
        # queries, scored by the fake data's error on each query.
        fake_answers = query_manager.get_answer(fake_data, debug=False)
        neg_fake_answers = 1 - fake_answers
        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)
        # Extended precision guards against overflow in exp();
        # np.longdouble is the portable spelling of np.float128.
        EM_dist_0 = np.exp(eps_t * score * N / 2, dtype=np.longdouble)
        em_total = np.sum(EM_dist_0)   # was `sum` — shadowed the builtin
        assert em_total > 0 and not np.isinf(em_total)
        EM_dist = EM_dist_0 / em_total
        assert not np.isnan(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, em_total)
        assert not np.isinf(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, em_total)

        # Sample one query from the EM distribution; indices at or past
        # Q_size refer to negated queries.
        q_t_ind = util2.sample(EM_dist)
        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

    if len(final_syn_data) == 0:
        # No round completed: fall back to purely synthetic data.
        status = status + '---syn data.'
        fake_data = Dataset.synthetic(domain, 100)
    else:
        if status == 'OK':
            # Return the top half (later, better rounds) — halves the error.
            final_syn_data = np.array(final_syn_data)
            final_syn_data = final_syn_data[T // 2:, :]
        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(final_syn_data, domain),
                         columns=domain.attrs), domain)
    if show_prgress:
        progress_bar.close()
    return fake_data, status