Code example #1
File: mst.py Project: meijiu/private-pgm
def select(data, epsilon, measurement_log, cliques=[]):
    engine = FactoredInference(data.domain, iters=1000)
    est = engine.estimate(measurement_log)

    # score each candidate pair by how poorly the current model explains
    # its 2-way marginal (L1 distance between data and model estimate)
    weights = {}
    candidates = list(itertools.combinations(data.domain.attrs, 2))
    for a, b in candidates:
        xhat = est.project([a, b]).datavector()
        x = data.project([a, b]).datavector()
        weights[a, b] = np.linalg.norm(x - xhat, 1)

    T = nx.Graph()
    T.add_nodes_from(data.domain.attrs)
    ds = DisjointSet()

    for e in cliques:
        T.add_edge(*e)
        ds.union(*e)

    # connect the remaining r components with r-1 privately selected edges
    r = len(list(nx.connected_components(T)))

    for i in range(r - 1):
        candidates = [e for e in candidates if not ds.connected(*e)]
        wgts = np.array([weights[e] for e in candidates])
        idx = permute_and_flip(wgts, epsilon / (r - 1), sensitivity=1.0)
        e = candidates[idx]
        T.add_edge(*e)
        ds.union(*e)

    return list(T.edges)
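
As excerpted, select leans on names defined elsewhere in its module. A minimal import header that would make it self-contained might look like this (a sketch: the disjoint-set package and the permute_and_flip module path are assumptions, not confirmed from the source):

import itertools
import numpy as np
import networkx as nx
from disjoint_set import DisjointSet     # PyPI disjoint-set: provides union()/connected()
from mbi import FactoredInference
from mechanisms import permute_and_flip  # hypothetical path to the private selection primitive
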
Code example #2
def run(dataset, measurements, eps=1.0, delta=0.0, bounded=True, engine='MD',
        options={}, iters=10000, seed=None, metric='L2', elim_order=None,
        frequency=1, workload=None):
    """
    Run a mechanism that measures the given measurements and runs inference.
    This is a convenience method for running end-to-end experiments.
    """

    domain = dataset.domain
    total = None

    state = np.random.RandomState(seed)

    # allow callers to pass bare projections; expand each into an identity measurement
    if len(measurements) >= 1 and type(measurements[0][0]) is str:
        matrix = lambda proj: sparse.eye(domain.project(proj).size())
        measurements = [(proj, matrix(proj)) for proj in measurements]

    # accumulate the L1/L2 sensitivity of the combined measurement set
    l1 = 0
    l2 = 0
    for _, Q in measurements:
        l1 += np.abs(Q).sum(axis=0).max()
        try:
            l2 += Q.power(2).sum(axis=0).max()  # sparse matrices
        except AttributeError:
            l2 += np.square(Q).sum(axis=0).max()  # dense matrices

    if bounded:
        total = dataset.df.shape[0]
        l1 *= 2
        l2 *= 2

    if delta > 0:
        noise = norm(loc=0, scale=np.sqrt(l2 * 2 * np.log(2/delta))/eps)
    else:
        noise = laplace(loc=0, scale=l1/eps)

    if workload is None:
        workload = measurements
   
    truth = [] 
    for proj, W in workload:
        x = dataset.project(proj).datavector()
        y = W.dot(x)
        truth.append( (W, y, proj) )

    answers = []
    for proj, Q in measurements:
        x = dataset.project(proj).datavector()
        z = noise.rvs(size=Q.shape[0], random_state=state)
        y = Q.dot(x)
        answers.append( (Q, y+z, 1.0, proj) )

    estimator = FactoredInference(domain, metric=metric, iters=iters, warm_start=False, elim_order=elim_order)
    logger = Logger(estimator, true_answers=truth, frequency=frequency)
    model = estimator.estimate(answers, total, engine=engine, callback=logger, options=options)
        
    return model, logger, answers
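
A hedged usage sketch for run, assuming an mbi-style Dataset (file paths are placeholders) and passing bare projections, which the function expands into identity measurements:

from mbi import Dataset

data = Dataset.load('adult.csv', 'adult-domain.json')  # placeholder paths
model, logger, answers = run(data, [('age',), ('race', 'sex')], eps=1.0, seed=0)
print(model.project(('race', 'sex')).datavector())
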
Code example #3
File: mst.py Project: meijiu/private-pgm
def MST(data, epsilon, delta):
    # This mechanism is designed for relatively large high-dimensional datasets
    # for lower-dimensional datasets (like adult), simpler mechanisms may be better
    sigma = calibrate_gaussian_noise(epsilon * 2.0 / 3.0, delta)
    cliques = [(col, ) for col in data.domain]
    log1 = measure(data, cliques, sigma)
    data, log1, undo_compress_fn = compress_domain(data, log1)
    cliques = select(data, epsilon / 3.0, log1)
    log2 = measure(data, cliques, sigma)
    engine = FactoredInference(data.domain, iters=5000)
    est = engine.estimate(log1 + log2)
    synth = est.synthetic_data()
    return undo_compress_fn(synth)
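
Assuming measure, compress_domain, and calibrate_gaussian_noise are the helpers defined alongside MST in this file, and that the return value is an mbi Dataset, end-to-end usage reduces to a sketch like:

from mbi import Dataset

data = Dataset.load('adult.csv', 'adult-domain.json')  # placeholder paths
synth = MST(data, epsilon=1.0, delta=1e-9)
synth.df.to_csv('synthetic.csv', index=False)
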
Code example #4
    def postprocess(self):
        #use noisy measurements to fit PGM inference
        #and generate synthetic data
        iters = self.iters
        domain = self.domain
        temp_domain = Domain.fromdict(domain)
        engine = FactoredInference(temp_domain,
                                   structural_zeros=None,
                                   iters=iters,
                                   log=True,
                                   warm_start=False,
                                   elim_order=self.elimination_order)
        self.engine = engine
        engine.estimate(self.measurements)

        self.synthetic = self.engine.model.synthetic_data()
        self.synthetic = reverse_data(self.synthetic, self.supports)
Code example #5
    def synthesize(self, file_path, eps, seed):
        # setup random state
        prng = np.random.RandomState(seed)

        # load data vector
        relation = Relation(self.config)
        relation.load_csv(file_path)
        self._numerize(relation._df)

        # perform measurement
        attributes = list(self.config.keys())
        measurements = []
        w_sum = sum(Ai.weight for Ai in self.strategy.matrices)
        for Ai in self.strategy.matrices:
            w = Ai.weight
            proj = [
                attributes[i] for i, B in enumerate(Ai.base.matrices)
                if type(B).__name__ != 'Ones'
            ]
            matrix = [
                B for B in Ai.base.matrices if type(B).__name__ != 'Ones'
            ]
            matrix = EkteloMatrix(np.ones((1, 1))) if len(matrix) == 0 else Kronecker(matrix)
            proj_rel = copy.deepcopy(relation)
            proj_rel.project(proj)
            if proj_rel.df.shape[1] == 0:
                x = np.array([proj_rel.df.shape[0]])
            else:
                x = Vectorize('').transform(proj_rel).flatten()
            y = Laplace(matrix, w * eps / w_sum).measure(x, prng)
            measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))

        # generate synthetic data
        sizes = [field['bins'] for field in self.config.values()]
        dom = Domain(attributes, sizes)
        engine = FactoredInference(dom)
        model = engine.estimate(measurements)
        df = model.synthetic_data().df
        self._denumerize(df)
        self._sample_numerical(df)

        return df
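
A usage sketch for this method; the constructor below is hypothetical, standing in for however the enclosing class is built from a column config and an HDMM strategy:

synthesizer = HdmmSynthesizer(config, strategy)  # hypothetical constructor
df = synthesizer.synthesize('data.csv', eps=1.0, seed=0)
df.to_csv('synthetic.csv', index=False)
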
Code example #6
    def postprocess(self):
        iters = self.iters
        domain = self.domain
        engine = FactoredInference(domain,
                                   structural_zeros=None,
                                   iters=500,
                                   log=True,
                                   warm_start=True,
                                   elim_order=self.elimination_order)
        self.engine = engine
        cb = mbi.callbacks.Logger(engine)

        if self.warmup:
            # seed the model with a product of clipped, normalized one-way marginals
            engine._setup(self.measurements, None)
            oneway = {}
            for i in range(len(self.round1)):
                p = self.round1[i]
                y = self.measurements[i][1]
                y = np.maximum(y, 1)
                y /= y.sum()
                oneway[p] = Factor(self.domain.project(p), y)
            marginals = {}
            for cl in engine.model.cliques:
                marginals[cl] = reduce(lambda x, y: x * y,
                                       [oneway[p] for p in cl])

            theta = engine.model.mle(marginals)
            engine.potentials = theta
            engine.marginals = engine.model.belief_prop_fast(theta)

        # optimize in chunks of 500 iterations, checkpointing every fourth chunk
        checkpt = self.save[:-4] + '-checkpt.csv'
        for i in range(self.iters // 500):

            engine.infer(self.measurements, engine='MD', callback=cb)

            if i % 4 == 3:
                self.synthetic = engine.model.synthetic_data()
                self.synthetic = reverse_data(self.synthetic, self.supports)
                self.transform_domain()
                self.synthetic.to_csv(checkpt, index=False)

        if os.path.exists(checkpt):
            os.remove(checkpt)

        self.synthetic = engine.model.synthetic_data()
        self.synthetic = reverse_data(self.synthetic, self.supports)
Code example #7
File: dual_query.py Project: ypzhang725/private-pgm
    total = data.df.shape[0]

    workload = []
    for cl, W in workloads:
        workload.append((cl, matrix.VStack([W, Negated(W)])))

    synthetic, cache = DualQuery(data,
                                 workload,
                                 eps=args.epsilon,
                                 delta=1e-3,
                                 seed=args.seed)

    metric = lambda marginals: marginal_loss(marginals, workload, cache)

    engine = FactoredInference(data.domain, metric=metric, iters=args.iters)
    measurements = [(Q, None, 1.0, cl) for cl, Q in workload]  # y is unused; the loss comes from the custom metric

    ans = engine.mirror_descent(measurements, total)
    model = engine.model
    mb = []
    dq = []

    for proj, W in workloads:
        est = W.dot(model.project(proj).datavector())
        x = synthetic.project(proj).datavector()
        x *= total / x.sum()
        est2 = W.dot(x)
        true = W.dot(data.project(proj).datavector())
        err = np.abs(est - true).sum() / np.abs(true).sum()
        err2 = np.abs(est2 - true).sum() / np.abs(true).sum()
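
The excerpt ends right after computing the per-workload errors; given the dangling mb and dq lists above, the continuation presumably collects and reports them, e.g.:

        mb.append(err)
        dq.append(err2)

    print('PGM error: %.4f, DualQuery error: %.4f' % (np.mean(mb), np.mean(dq)))
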
Code example #8
    """ Efficiently take measurements from HDMM strategy and convert to a PGM-compatable form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        proj = [
            attributes[i] for i, B in enumerate(Ai.base.matrices)
            if type(B) != workload.Ones
        ]
        print(proj)
        matrix = workload.Kronecker(
            [B for B in Ai.base.matrices if type(B) != workload.Ones])
        matrix = w * matrix.sparse_matrix()
        x = data.project(proj).datavector()  # does Relation have this functionality?
        y = matrix.dot(x) + np.random.laplace(loc=0, scale=1, size=matrix.shape[0])
        measurements.append((matrix, y, 1.0, proj))
    return measurements


measurements = take_measurements(A, data)

engine = FactoredInference(dom)
model = engine.estimate(measurements)

df = model.synthetic_data().df
print(df.head())

# Then you can post-process to change category/bin ids with values
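
A minimal sketch of that post-processing step, assuming you kept a per-column mapping from integer codes back to the original labels (the mapping below is hypothetical):

value_maps = {'education': {0: 'HS-grad', 1: 'Bachelors', 2: 'Masters'}}  # hypothetical
for col, mapping in value_maps.items():
    df[col] = df[col].map(mapping)
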
Code example #9
    def setUp(self):
        if skip: raise unittest.SkipTest('PyTorch not installed')
        test_inference.TestInference.setUp(self)
        self.engine = FactoredInference(self.domain, backend='torch', log=True)
Code example #10
File: toy_example.py Project: ypzhang725/private-pgm
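
This excerpt assumes setup from earlier in the file: a three-attribute domain and the true marginals ab and bc. A plausible reconstruction (attribute sizes are assumptions):

import numpy as np
from mbi import Domain, Dataset, FactoredInference

domain = Domain(['A', 'B', 'C'], [2, 3, 4])  # assumed sizes
data = Dataset.synthetic(domain, 1000)       # random toy data
ab = data.project(['A', 'B']).datavector()
bc = data.project(['B', 'C']).datavector()
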
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon

np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)

measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')

# recover consistent estimates of measurements
ab2 = model.project(['A','B']).datavector()
bc2 = model.project(['B','C']).datavector()

print(ab2)

print(bc2)

# estimate answer to unmeasured queries
ac2 = model.project(['A','C']).datavector()
print(ac2)

# generate synthetic data
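The excerpt cuts off at this comment; assuming the same mbi API used above, the natural continuation is:

synth = model.synthetic_data()
print(synth.df.head())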
Code example #11
    parser = argparse.ArgumentParser(description=description, formatter_class=formatter)
    parser.add_argument('--dataset', choices=['adult'], help='dataset to use')
    parser.add_argument('--engine', choices=['MW','PGM'], help='inference engine')
    parser.add_argument('--iters', type=int, help='number of optimization iterations')
    parser.add_argument('--rounds', type=int, help='number of rounds to run mwem')
    parser.add_argument('--epsilon', type=float, help='privacy parameter')
    parser.add_argument('--seed', type=int, help='random seed')

    parser.set_defaults(**default_params())
    args = parser.parse_args()

    data, workloads = benchmarks.adult_benchmark()
    prng = np.random.RandomState(args.seed)

    if args.engine == 'MW':
        engine = FactoredMultiplicativeWeights(data.domain, iters=args.iters)
    else:
        engine = FactoredInference(data.domain, iters=args.iters) 

    if args.rounds is None:
        rounds = len(data.domain)
    else:
        rounds = args.rounds

    ans = mwem(workloads, data, args.epsilon, engine, iters=rounds, prng=prng)

    error = average_error(workloads, data, ans)

    print('Error: %.3f' % error)
    
Code example #12
# measure every one-way marginal with the Laplace mechanism
measurements = []
for col in data.domain:
    x = data.project(col).datavector()
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    I = Identity(x.size)
    measurements.append((I, y, sigma, (col, )))

# spend half of privacy budget to measure some more 2 and 3 way marginals

cliques = [('age', 'education-num'), ('marital-status', 'race'),
           ('sex', 'hours-per-week'), ('hours-per-week', 'income>50K'),
           ('native-country', 'marital-status', 'occupation')]

sigma = 1.0 / len(cliques) / 2.0

for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    I = Identity(x.size)
    measurements.append((I, y, sigma, cl))

# now perform inference to estimate the data distribution

engine = FactoredInference(domain, backend='torch', log=True, iters=10000)
model = engine.estimate(measurements, total=total, engine='RDA')

# now answer new queries

y1 = model.project(('sex', 'income>50K')).datavector()
y2 = model.project(('race', 'occupation')).datavector()
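
This excerpt presupposes setup earlier in the script. A plausible reconstruction under assumed paths and budget split (Identity here is a stand-in for a sparse identity helper):

import numpy as np
from scipy import sparse
from mbi import Dataset, FactoredInference

data = Dataset.load('adult.csv', 'adult-domain.json')  # placeholder paths
domain = data.domain
total = data.df.shape[0]
sigma = 1.0 / len(domain) / 2.0  # assumed: half the budget over the one-way marginals
Identity = lambda n: sparse.eye(n)  # stand-in for the project's Identity matrix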