Example #1
from mbi import Dataset, FactoredInference, Domain
import numpy as np

# discrete domain with attributes A, B, C and corresponding sizes 2 x 3 x 4
domain = Domain(['A','B','C'], [2, 3, 4])

# synthetic dataset with 1000 rows
data = Dataset.synthetic(domain, 1000) 

# project data onto subset of cols, and vectorize
ab = data.project(['A','B']).datavector()
bc = data.project(['B','C']).datavector()

# add noise to preserve differential privacy
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon

np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record each measurement as a (query matrix, noisy answer, noise scale, projection) tuple for inference
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)

measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')
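
Once estimate returns, the fitted model can be queried much like the original dataset. The follow-up below is a minimal sketch, assuming the GraphicalModel returned by Private-PGM exposes project and synthetic_data (as in current versions of mbi):

# compare the model's estimate of the A,B marginal with the noisy measurement
ab_est = model.project(['A', 'B']).datavector()
print(np.linalg.norm(ab_est - yab, 1))

# draw a synthetic dataset whose marginals approximate the model's
synth = model.synthetic_data()
print(synth.df.head())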
Example #2
from hdmm.templates import DefaultKron, Marginals, DefaultUnionKron
from hdmm import workload
from mbi import FactoredInference, Domain, Dataset
import numpy as np

# set up domain and workload
attributes = ['A', 'B', 'C']  # names of the data columns
sizes = [32, 32, 32]
dom = Domain(attributes, sizes)
#W = workload.Prefix2D(32)
W = workload.DimKMarginals(sizes, 1)
data = Dataset.synthetic(dom, 1000)

# optimize strategy using HDMM
#template = DefaultKron(sizes)
#template = Marginals(sizes)
template = DefaultUnionKron(sizes, 3)
template.optimize(W)
A = template.strategy()


def take_measurements(A, data):
    """ Efficiently take measurements from the HDMM strategy and convert them to a PGM-compatible form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        # The listing is truncated here in the source; the rest of the loop is a
        # hypothetical completion that answers each weighted Kronecker term with
        # Laplace noise over the full domain (Ai already includes the weight w).
        proj = tuple(attributes)
        x = data.project(proj).datavector()
        y = Ai.dot(x) + np.random.laplace(loc=0.0, scale=1.0, size=Ai.shape[0])
        measurements.append((Ai, y, 1.0, proj))
    return measurements
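
The listing cuts off inside take_measurements; the script presumably finishes by handing the noisy measurements to Private-PGM. A minimal sketch of that last step, assuming the (matrix, answer, noise scale, projection) tuples built above are what FactoredInference.estimate expects:

measurements = take_measurements(A, data)
engine = FactoredInference(dom)
model = engine.estimate(measurements)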
Example #3
import time
import multiprocessing as mp

import numpy as np
import pandas as pd
from tqdm import tqdm

# util2, gen_fake_data, and Dataset are module-level dependencies from the
# surrounding FEM codebase; they are not standard-library or PyPI imports.


def generate(data,
             query_manager,
             epsilon,
             epsilon_0,
             exponential_scale,
             adaptive,
             samples,
             alpha=0,
             timeout=None,
             show_progress=True):
    domain = data.domain
    D = np.sum(domain.shape)
    N = data.df.shape[0]
    Q_size = query_manager.num_queries
    delta = 1.0 / N**2
    beta = 0.05  ## Failure probability

    prev_queries = []
    neg_queries = []
    rho_comp = 0.0000

    q1 = util2.sample(np.ones(Q_size) / Q_size)
    q2 = util2.sample(np.ones(Q_size) / Q_size)
    prev_queries.append(q1)  ## Sample a query from the uniform distribution
    neg_queries.append(q2)  ## Sample a query from the uniform distribution

    real_answers = query_manager.get_answer(data, debug=False)
    neg_real_answers = 1 - real_answers

    final_syn_data = []
    fem_start_time = time.time()
    temp = []

    # T = util.get_rounds(epsilon, epsilon_0, delta)
    T = util2.get_rounds_zCDP(epsilon, epsilon_0, adaptive, delta)
    if show_progress:
        progress_bar = tqdm(total=T)
    status = 'OK'
    for t in range(T):
        eps_t = epsilon_0 + adaptive * t

        if show_progress: progress_bar.update()
        """
        End early after timeout seconds 
        """
        if (timeout is not None) and time.time() - fem_start_time > timeout:
            status = 'Timeout'
            break
        if (timeout is not None) and t >= 1 and \
                (time.time() - fem_start_time) * T / t > timeout:
            status = 'Ending Early ({:.2f}s) '.format(
                (time.time() - fem_start_time) * T / t)
            break
        """
        Sample s times from FTPL
        """
        util2.blockPrint()
        num_processes = 8
        s2 = int(1.0 + samples / num_processes)
        samples_rem = samples
        processes = []
        manager = mp.Manager()
        fake_temp = manager.list()

        query_workload = query_manager.get_query_workload(prev_queries)
        neg_query_workload = query_manager.get_query_workload(neg_queries)

        for __ in range(num_processes):
            temp_s = samples_rem if samples_rem - s2 < 0 else s2
            samples_rem -= temp_s
            noise = np.random.exponential(exponential_scale, (temp_s, D))
            proc = mp.Process(target=gen_fake_data,
                              args=(fake_temp, query_workload,
                                    neg_query_workload, noise, domain, alpha,
                                    temp_s))

            proc.start()
            processes.append(proc)

        assert samples_rem == 0, "samples_rem = {}".format(samples_rem)
        for p in processes:
            p.join()

        util2.enablePrint()
        oh_fake_data = []
        assert len(fake_temp) > 0
        for x in fake_temp:
            oh_fake_data.append(x)
            temp.append(x)
            # if current_eps >= epsilon / 2:  ## this trick halves the final error
            # if t >= T / 2:  ## this trick halves the final error
            final_syn_data.append(x)

        assert len(oh_fake_data) == samples, \
            "len(oh_fake_data) = {} len(fake_temp) = {}".format(
                len(oh_fake_data), len(fake_temp))
        for i in range(samples):
            assert len(oh_fake_data[i]) == D, "D_hat dim = {}".format(
                len(oh_fake_data[i]))
        assert not final_syn_data or len(final_syn_data[0]) == D, \
            "D_hat dim = {}".format(len(final_syn_data[0]))

        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(oh_fake_data, domain),
                         columns=domain.attrs), domain)
        """
        Compute Exponential Mechanism distribution
        """
        fake_answers = query_manager.get_answer(fake_data, debug=False)
        neg_fake_answers = 1 - fake_answers

        score = np.append(real_answers - fake_answers,
                          neg_real_answers - neg_fake_answers)

        # extended precision guards against overflow in the exponential
        EM_dist_0 = np.exp(eps_t * score * N / 2, dtype=np.longdouble)
        em_total = np.sum(EM_dist_0)
        assert em_total > 0 and not np.isinf(em_total)
        EM_dist = EM_dist_0 / em_total
        assert not np.isnan(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, em_total)
        assert not np.isinf(EM_dist).any(), \
            "EM_dist_0 = {} EM_dist = {} sum = {}".format(EM_dist_0, EM_dist, em_total)
        """
        Sample from EM
        """
        q_t_ind = util2.sample(EM_dist)

        if q_t_ind < Q_size:
            prev_queries.append(q_t_ind)
        else:
            neg_queries.append(q_t_ind - Q_size)

    if len(final_syn_data) == 0:
        # no samples were collected (e.g. immediate timeout): fall back to random data
        status = status + '---syn data.'
        fake_data = Dataset.synthetic(domain, 100)
    else:
        if status == 'OK':
            # keep only the most recent half of the sampled rows
            final_syn_data = np.array(final_syn_data)
            final_syn_data = final_syn_data[final_syn_data.shape[0] // 2:, :]
        fake_data = Dataset(
            pd.DataFrame(util2.decode_dataset(final_syn_data, domain),
                         columns=domain.attrs), domain)
    if show_progress: progress_bar.close()
    return fake_data, status
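
For context, here is a hypothetical driver for generate. QueryManager and its constructor arguments are invented for illustration; they stand in for whatever query-manager class the surrounding FEM codebase provides (anything exposing num_queries, get_answer, and get_query_workload as used above), and Dataset/Domain follow the mbi-style API the function already relies on.

# hypothetical usage -- QueryManager is a stand-in, not a real FEM class name
domain = Domain(['A', 'B', 'C'], [2, 2, 2])
data = Dataset.synthetic(domain, 1000)
query_manager = QueryManager(domain, marginals=[('A', 'B'), ('B', 'C')])

synth, status = generate(data,
                         query_manager,
                         epsilon=1.0,
                         epsilon_0=0.003,
                         exponential_scale=0.1,
                         adaptive=0.0,
                         samples=10)
print(status, synth.df.shape)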