Esempio n. 1
0
def DualQuery(data, workload, eps=1.0, delta=0.001, seed=0):
    """Build a synthetic dataset by repeatedly sampling queries and
    collecting the record returned by ``max_sum_ve`` for each sample.

    The number of rounds ``T`` is the largest value for which the
    approximate privacy expression below stays under ``eps``.  Each round
    samples ``s`` query indices from the current query distribution ``Q``,
    asks ``max_sum_ve`` for a best response to the negated queries, and then
    reweights ``Q`` multiplicatively using the difference between the true
    (normalised) answers and the answers on that best response.

    :param data: dataset exposing ``df`` and ``domain``
    :param workload: iterable of ``(clique, W)`` pairs, where ``W`` has one
            row per query
    :param eps: privacy budget used to choose the number of rounds
    :param delta: delta used both to choose the number of rounds and in the
            final privacy report
    :param seed: seed for the ``numpy.random.RandomState`` that samples
            queries
    :return: ``(synthetic, cache)`` where ``synthetic`` is a ``Dataset``
            made from the concatenated per-round results and ``cache`` is a
            list of ``(sampled indices, workload answers)`` per round
    """
    prng = np.random.RandomState(seed)
    total = data.df.shape[0]
    domain = data.domain
    answers = answer_workload(workload, data) / total

    nu = 2.0  # step size of the multiplicative update on Q
    s = 50    # number of queries sampled in every round

    def approx_eps(rounds, delta_):
        # Single definition of the privacy expression, used both for picking
        # T and for the final report so the two can never drift apart.
        # NOTE(review): the trailing "- 1" sits inside the square root
        # together with the exponential term; confirm this grouping against
        # the intended privacy bound before relying on the reported value.
        return 2 * nu * (rounds - 1) / total * (
            np.sqrt(2 * s * (rounds - 1) * np.log(1.0 / delta_) + s *
                    (rounds - 1) * np.exp(2 * nu * (rounds - 1) / total) - 1))

    # Largest T whose approximate privacy cost is still below eps.
    T = 2
    while approx_eps(T, delta) < eps:
        T = T + 1
    T = T - 1

    Qsize = sum(W.shape[0] for _, W in workload)

    # Start from the uniform distribution over all individual queries.
    Q = np.ones(Qsize) / Qsize
    cache = []
    # Flat index -> (clique, query matrix, row within that matrix).
    lookup = [(cl, W, row) for cl, W in workload for row in range(W.shape[0])]
    results = []

    for _ in range(T):
        idx = prng.choice(Qsize, s, True, Q)

        queries = []
        for j in idx:
            cl, W, e = lookup[j]
            dom = domain.project(cl)
            # Select row e of W via a one-hot vector.
            z = np.zeros(W.shape[0])
            z[e] = 1.0
            q = W.T.dot(z)
            queries.append(Factor(dom, -q))

        best = max_sum_ve(queries, data.domain)
        curr = answer_workload(workload, best)

        Q *= np.exp(-nu * (answers - curr))
        Q /= Q.sum()

        cache.append((idx, curr))
        results.append(best.df)

    synthetic = Dataset(pd.concat(results), data.domain)

    print('Iterations', T)
    print('Privacy level', nu * T * (T - 1) * s / total)
    # Report with the caller's delta (previously a hard-coded 1e-3 silently
    # replaced the argument here).
    print('Approx privacy level', approx_eps(T, delta), delta)

    return synthetic, cache
Esempio n. 2
0
def privbayes_inference(domain, measurements, total):
    """Sample a synthetic dataset one column at a time from measurements.

    The first measurement is clipped at zero, normalised and used as the
    sampling distribution of its first attribute.  Every later measurement
    is clipped, reshaped into a ``Factor`` over its attributes and divided
    by its projection onto the conditioning attributes; the first attribute
    is then sampled separately for every configuration of the conditioning
    attributes already present in the synthetic table.

    :param domain: domain object, indexable by attribute and supporting
            ``project``
    :param measurements: sequence of 4-tuples; only the second entry (the
            measured vector) and the fourth entry (the attribute tuple) are
            used here
    :param total: number of synthetic records to draw
    :return: ``Dataset(synthetic, domain)``
    """
    synthetic = pd.DataFrame()

    _, y, _, proj = measurements[0]
    y = np.maximum(y, 0)
    mass = y.sum()
    if mass == 0:
        # Everything was clipped away: fall back to uniform, mirroring the
        # fallback already used for the conditional columns below.
        y = np.ones(y.shape) / y.size
    else:
        y = y / mass
    col = proj[0]
    synthetic[col] = np.random.choice(domain[col], total, True, y)

    for _, y, _, proj in measurements[1:]:
        # find the CPT
        col, dep = proj[0], proj[1:]
        print(col)
        y = np.maximum(y, 0)
        dom = domain.project(proj)
        cpt = Factor(dom, y.reshape(dom.shape))
        marg = cpt.project(dep)
        cpt /= marg
        # Move the sampled attribute to the last axis so that indexing with
        # a configuration of `dep` yields the distribution over `col`.
        cpt2 = np.moveaxis(cpt.project(proj).values, 0, -1)

        # sample current column
        synthetic[col] = 0
        n = domain[col]
        # The conditioning columns are not modified while `col` is filled,
        # so extract them once instead of once per configuration.
        parents = synthetic.loc[:, dep].values
        rng = itertools.product(*[range(domain[a]) for a in dep])
        for v in rng:
            idx = (parents == np.array(v)).all(axis=1)
            p = cpt2[v].flatten()
            if p.sum() == 0:
                p = np.ones(p.size) / p.size
            N = idx.sum()
            if N > 0:
                synthetic.loc[idx, col] = np.random.choice(n, N, True, p)

    return Dataset(synthetic, domain)
Esempio n. 3
0
def marginal_loss(marginals, workload, cache):
    """Evaluate ``log_likelihood`` on the workload answers implied by
    ``marginals`` and push its gradient back onto the cliques.

    For every ``(proj, W)`` in ``workload`` the first clique of
    ``marginals`` that contains all attributes of ``proj`` supplies the
    marginal; workload entries without such a clique are skipped.  Answers
    are normalised by the sum of the last marginal vector that was used.

    :param marginals: mapping from clique to a factor supporting ``project``
    :param workload: iterable of ``(proj, W)`` pairs
    :param cache: second argument forwarded to ``log_likelihood``
    :return: ``(loss, CliqueVector)`` holding the loss value and the
            per-clique gradient factors
    """
    def covering_clique(attrs):
        # First clique, in iteration order, that contains every attribute.
        needed = set(attrs)
        for clique in marginals:
            if needed <= set(clique):
                return clique
        return None

    pieces = []
    for proj, W in workload:
        clique = covering_clique(proj)
        if clique is None:
            continue
        x = marginals[clique].project(proj).values.flatten()
        pieces.append(W.dot(x))
    total = x.sum()
    answers = np.concatenate(pieces) / total

    loss = log_likelihood(answers, cache)
    danswers = grad(log_likelihood, argnum=0)(answers, cache)

    gradients = {cl: Factor.zeros(marginals[cl].domain) for cl in marginals}
    offset = 0  # position of the current query block inside `danswers`
    for proj, W in workload:
        clique = covering_clique(proj)
        if clique is None:
            continue
        rows = W.shape[0]
        dmu = W.T.dot(danswers[offset:offset + rows]) / total
        sub = gradients[clique].domain.project(proj)
        gradients[clique] += Factor(sub, dmu)
        offset += rows

    print(loss)
    return loss, graphical_model.CliqueVector(gradients)
Esempio n. 4
0
 def fit(self, data):
     """Set ``self.potentials`` from the clique marginals of ``data``.

     For every clique in ``self.cliques`` the data is projected onto the
     clique, turned into a data vector and wrapped in a ``Factor`` over the
     model's projected domain; the resulting dictionary is passed to
     ``self.mle``.

     :param data: dataset whose domain must contain the model domain
     """
     from mbi import Factor
     assert data.domain.contains(self.domain), \
         'model domain not compatible with data domain'

     def clique_marginal(clique):
         # Data vector first, then the matching sub-domain of the model.
         counts = data.project(clique).datavector()
         return Factor(self.domain.project(clique), counts)

     marginals = {clique: clique_marginal(clique) for clique in self.cliques}
     self.potentials = self.mle(marginals)
Esempio n. 5
0
    def __init__(self, factors, domain, total):
        """
        Store the factors and add a uniform one-way factor for every
        attribute of the domain that no supplied factor mentions.

        :param factors: a list of contingency tables,
                defined over disjoint subsets of attributes
        :param domain: the domain object
        :param total: known or estimated total
        """
        # Keep a private copy: the padding below must not extend the list
        # object owned by the caller.
        self.factors = list(factors)
        self.domain = domain
        self.total = total

        for a in domain:
            if not any(a in f.domain for f in self.factors):
                sub = domain.project([a])
                # Uniform distribution over the values of attribute `a`.
                x = np.ones(domain[a]) / domain[a]
                self.factors.append(Factor(sub, x))
Esempio n. 6
0
    def postprocess(self):
        """Run inference on ``self.measurements`` and store the resulting
        synthetic data on ``self.synthetic``.

        A ``FactoredInference`` engine is created with 500 iterations per
        call and invoked ``self.iters // 500`` times.  When ``self.warmup``
        is set, the engine is first initialised from products of clipped,
        normalised one-way measurements.  After every fourth call a
        checkpoint CSV is written; the checkpoint file is removed at the end
        if it exists.
        """
        total_iters = self.iters
        domain = self.domain
        engine = FactoredInference(
            domain,
            structural_zeros=None,
            iters=500,
            log=True,
            warm_start=True,
            elim_order=self.elimination_order,
        )
        self.engine = engine
        cb = mbi.callbacks.Logger(engine)

        def draw_synthetic():
            # Sample from the current model, then map back via the supports.
            self.synthetic = engine.model.synthetic_data()
            self.synthetic = reverse_data(self.synthetic, self.supports)

        if self.warmup:
            engine._setup(self.measurements, None)

            # One-way estimates: clip each measured vector at 1, normalise.
            oneway = {}
            for k, attr in enumerate(self.round1):
                est = self.measurements[k][1]
                est = np.maximum(est, 1)
                est /= est.sum()
                oneway[attr] = Factor(domain.project(attr), est)

            # Clique marginal = product of the one-way factors it contains.
            marginals = {
                cl: reduce(lambda a, b: a * b, [oneway[attr] for attr in cl])
                for cl in engine.model.cliques
            }

            theta = engine.model.mle(marginals)
            engine.potentials = theta
            engine.marginals = engine.model.belief_prop_fast(theta)

        checkpt = self.save[:-4] + '-checkpt.csv'
        for round_no in range(total_iters // 500):
            engine.infer(self.measurements, engine='MD', callback=cb)

            if round_no % 4 != 3:
                continue
            # Every fourth round: write a checkpoint of the current sample.
            draw_synthetic()
            self.transform_domain()
            self.synthetic.to_csv(checkpt, index=False)

        if os.path.exists(checkpt):
            os.remove(checkpt)

        draw_synthetic()
Esempio n. 7
0
    def multWeightsFast(self, measurements, total):
        """Fit one factor per measurement cluster with multiplicative
        updates and store the result as ``self.model``.

        ``_cluster`` splits the measurements into groups with an associated
        projection.  For each group a uniform factor over the projected
        domain is updated ``self.iters`` times: the back-projected errors of
        all measurements in the group are summed, exponentiated after
        scaling by ``1 / (2 * total)``, multiplied in, and the factor is
        renormalised.

        :param measurements: measurements passed to ``_cluster``; each group
                element unpacks as ``(Q, y, noise_scale, p)``
        :param total: total used to scale the factor into estimated counts
        """
        domain = self.domain
        groups, projections = _cluster(measurements)

        def fit_group(group, proj):
            # Multiplicative-update fit of a single factor over `proj`.
            dom = domain.project(proj)
            fact = Factor.uniform(dom)
            for _ in range(self.iters):
                update = Factor.zeros(dom)
                for Q, y, _noise_scale, p in group:
                    sub = dom.project(p)
                    estimate = fact.project(p).values.flatten() * total
                    residual = y - Q.dot(estimate)
                    back = Q.T.dot(residual).reshape(sub.shape)
                    update += Factor(sub, back)
                fact *= np.exp(update / (2 * total))
                fact /= fact.sum()
            return fact

        factors = [fit_group(group, proj)
                   for group, proj in zip(groups, projections)]

        self.model = ProductDist(factors, domain, total)
Esempio n. 8
0
 def krondot(self, matrices):
     """ Answer the Kronecker-product query set Q1 x Q2 x ... x Qd, where
         Qi is the query matrix for the ith attribute.

     Each Qi is wrapped as a factor over a fresh '<attr>-answer' axis and
     the attribute itself; all original attributes are then eliminated, so
     only the answer axes remain.  This may be cheaper than computing a
     supporting marginal and multiplying it by Q, in particular when each
     Qi has only a few rows.

     :param matrices: a list of matrices for each attribute in the domain
     :return: the vector of query answers
     """
     assert all(
         M.shape[1] == n for M, n in zip(matrices, self.domain.shape)
     ), 'matrices must conform to the shape of the domain'

     log_partition = self.belief_propagation(self.potentials, logZ=True)
     clique_factors = [self.potentials[cl].exp() for cl in self.cliques]
     factor_cls = type(clique_factors[0])  # reuse the potentials' factor type

     attrs = self.domain.attrs
     answer_axes = ['%s-answer' % a for a in attrs]
     query_factors = [
         factor_cls(Domain([axis, attr], Q.shape), Q)
         for attr, axis, Q in zip(attrs, answer_axes, matrices)
     ]

     result = variable_elimination(clique_factors + query_factors, attrs)
     result = result.transpose(answer_axes)
     return result.datavector(flatten=False) * self.total / np.exp(log_partition)