def select(data, epsilon, measurement_log, cliques=None):
    """Privately pick attribute pairs that connect the attribute graph.

    A model is fitted to ``measurement_log`` and every attribute pair is
    scored by the L1 distance between its marginal in ``data`` and the
    marginal implied by the fitted model.  Starting from the graph formed by
    ``cliques``, one pair is drawn per round with ``permute_and_flip`` until
    the graph has a single connected component.

    Args:
        data: dataset exposing ``domain.attrs`` and
            ``project(...).datavector()``.
        epsilon: privacy budget; each of the ``r - 1`` selection rounds uses
            ``epsilon / (r - 1)``, where ``r`` is the number of connected
            components before selection starts.
        measurement_log: measurements handed to
            ``FactoredInference.estimate``.
        cliques: optional iterable of attribute pairs that are already edges
            of the graph.  Defaults to no pre-selected edges.

    Returns:
        list: the edges of the final graph (pre-selected plus newly chosen).
    """
    # ``None`` replaces the former shared mutable ``[]`` default.
    if cliques is None:
        cliques = []

    engine = FactoredInference(data.domain, iters=1000)
    est = engine.estimate(measurement_log)

    # Score every pair by how badly the current model reproduces it.
    candidates = list(itertools.combinations(data.domain.attrs, 2))
    weights = {}
    for a, b in candidates:
        xhat = est.project([a, b]).datavector()
        x = data.project([a, b]).datavector()
        weights[a, b] = np.linalg.norm(x - xhat, 1)

    T = nx.Graph()
    T.add_nodes_from(data.domain.attrs)
    ds = DisjointSet()

    for e in cliques:
        T.add_edge(*e)
        ds.union(*e)

    r = len(list(nx.connected_components(T)))

    # When r == 1 the loop body never runs, so epsilon / (r - 1) is never
    # evaluated with a zero denominator.
    for _ in range(r - 1):
        # Only pairs that join two different components remain eligible.
        candidates = [e for e in candidates if not ds.connected(*e)]
        wgts = np.array([weights[e] for e in candidates])
        idx = permute_and_flip(wgts, epsilon / (r - 1), sensitivity=1.0)
        e = candidates[idx]
        T.add_edge(*e)
        ds.union(*e)

    return list(T.edges)
def run(dataset, measurements, eps=1.0, delta=0.0, bounded=True, engine='MD',
        options=None, iters=10000, seed=None, metric='L2', elim_order=None,
        frequency=1, workload=None):
    """Run a mechanism that measures the given measurements and runs inference.

    This is a convenience method for running end-to-end experiments.

    Args:
        dataset: object exposing ``domain``, ``df`` and
            ``project(...).datavector()``.
        measurements: list of ``(proj, Q)`` pairs.  If the first entry's
            first element is a string, the list is instead treated as a list
            of projections and each one is paired with a sparse identity
            matrix of the projected domain size.
        eps: privacy parameter dividing the noise scale.
        delta: if greater than zero, noise is drawn from ``norm``; otherwise
            from ``laplace``.
        bounded: if True, ``total`` is set to the number of rows of
            ``dataset.df`` and both accumulated sensitivities are doubled.
        engine: forwarded to ``FactoredInference.estimate``.
        options: dict forwarded to ``FactoredInference.estimate``; defaults
            to an empty dict.
        iters, metric, elim_order: forwarded to ``FactoredInference``.
        seed: seed for the ``np.random.RandomState`` that draws the noise.
        frequency: forwarded to ``Logger``.
        workload: list of ``(proj, W)`` pairs whose exact answers are given
            to the logger; defaults to ``measurements``.

    Returns:
        tuple: ``(model, logger, answers)`` where ``answers`` is the list of
        ``(Q, noisy_answer, 1.0, proj)`` tuples passed to the estimator.
    """
    # ``None`` replaces the former shared mutable ``{}`` default.
    if options is None:
        options = {}

    domain = dataset.domain
    total = None
    state = np.random.RandomState(seed)

    if len(measurements) >= 1 and isinstance(measurements[0][0], str):
        measurements = [
            (proj, sparse.eye(domain.project(proj).size()))
            for proj in measurements
        ]

    # l1 sums the largest absolute column sum of each query matrix; l2 sums
    # the largest column sum of squared entries.
    l1 = 0
    l2 = 0
    for _, Q in measurements:
        l1 += np.abs(Q).sum(axis=0).max()
        try:
            squared = Q.power(2)  # for sparse matrices
        except AttributeError:
            squared = np.square(Q)  # for dense matrices
        l2 += squared.sum(axis=0).max()

    if bounded:
        total = dataset.df.shape[0]
        l1 *= 2
        l2 *= 2

    if delta > 0:
        noise = norm(loc=0, scale=np.sqrt(l2 * 2 * np.log(2 / delta)) / eps)
    else:
        noise = laplace(loc=0, scale=l1 / eps)

    if workload is None:
        workload = measurements

    # Exact workload answers, used only by the logger.
    truth = []
    for proj, W in workload:
        x = dataset.project(proj).datavector()
        truth.append((W, W.dot(x), proj))

    answers = []
    for proj, Q in measurements:
        x = dataset.project(proj).datavector()
        # Noise is drawn before the exact answer is computed, as before, so
        # the random stream is consumed in the same order.
        z = noise.rvs(size=Q.shape[0], random_state=state)
        y = Q.dot(x)
        answers.append((Q, y + z, 1.0, proj))

    estimator = FactoredInference(domain, metric=metric, iters=iters,
                                  warm_start=False, elim_order=elim_order)
    logger = Logger(estimator, true_answers=truth, frequency=frequency)
    model = estimator.estimate(answers, total, engine=engine,
                               callback=logger, options=options)
    return model, logger, answers
def MST(data, epsilon, delta):
    """Generate synthetic data: measure, select pairs, measure again, infer.

    Single-attribute cliques are measured first, the domain is compressed,
    pairwise cliques are chosen by ``select`` using ``epsilon / 3`` and
    measured with the same noise scale, and a model fitted to both logs
    produces the synthetic data, which is mapped back through the undo
    function returned by ``compress_domain``.
    """
    # This mechanism is designed for relatively large high-dimensional datasets
    # for lower-dimensional datasets (like adult), simpler mechanisms may be better
    noise_scale = calibrate_gaussian_noise(epsilon * 2.0 / 3.0, delta)

    # Round 1: one clique per attribute.
    single_attr_cliques = [(col, ) for col in data.domain]
    first_log = measure(data, single_attr_cliques, noise_scale)
    data, first_log, undo_compress_fn = compress_domain(data, first_log)

    # Round 2: privately selected pairs, measured on the compressed data.
    pair_cliques = select(data, epsilon / 3.0, first_log)
    second_log = measure(data, pair_cliques, noise_scale)

    model = FactoredInference(data.domain, iters=5000).estimate(first_log + second_log)
    return undo_compress_fn(model.synthetic_data())
def postprocess(self):
    """Fit a graphical model to the noisy measurements and sample from it.

    Builds a ``FactoredInference`` engine over ``Domain.fromdict(self.domain)``,
    stores it on ``self.engine``, estimates from ``self.measurements`` and
    stores the synthetic data, passed through ``reverse_data`` with
    ``self.supports``, on ``self.synthetic``.
    """
    # use noisy measurements to fit PGM inference
    # and generate synthetic data
    # NOTE(review): the iteration count is fixed at 10000 and self.iters is
    # not consulted (the former unused local copy of it was removed) --
    # confirm this is intended.
    temp_domain = Domain.fromdict(self.domain)
    engine = FactoredInference(temp_domain,
                               structural_zeros=None,
                               iters=10000,
                               log=True,
                               warm_start=False,
                               elim_order=self.elimination_order)
    self.engine = engine
    engine.estimate(self.measurements)

    self.synthetic = self.engine.model.synthetic_data()
    self.synthetic = reverse_data(self.synthetic, self.supports)
def synthesize(self, file_path, eps, seed):
    """Measure the CSV at ``file_path`` with the stored strategy and return
    a synthetic data frame.

    Each strategy term is measured with ``Laplace`` using a share of ``eps``
    proportional to the term's weight; the measurements are fed to
    ``FactoredInference`` and the resulting synthetic frame is passed through
    ``_denumerize`` and ``_sample_numerical`` before being returned.
    """
    # setup random state
    prng = np.random.RandomState(seed)

    # load data vector
    relation = Relation(self.config)
    relation.load_csv(file_path)
    self._numerize(relation._df)

    # perform measurement
    attributes = list(self.config.keys())
    w_sum = sum(Ai.weight for Ai in self.strategy.matrices)

    measurements = []
    for Ai in self.strategy.matrices:
        w = Ai.weight

        # Keep only the factors that are not 'Ones', together with the
        # attribute each one belongs to.
        kept = [(i, B) for i, B in enumerate(Ai.base.matrices)
                if type(B).__name__ != 'Ones']
        proj = [attributes[i] for i, _ in kept]
        factors = [B for _, B in kept]
        if factors:
            matrix = Kronecker(factors)
        else:
            matrix = EkteloMatrix(np.ones((1, 1)))

        proj_rel = copy.deepcopy(relation)
        proj_rel.project(proj)
        if proj_rel.df.shape[1] == 0:
            # No columns left: the data vector is just the row count.
            x = np.array([proj_rel.df.shape[0]])
        else:
            x = Vectorize('').transform(proj_rel).flatten()

        y = Laplace(matrix, w * eps / w_sum).measure(x, prng)
        measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))

    # generate synthetic data
    sizes = [field['bins'] for field in self.config.values()]
    dom = Domain(attributes, sizes)
    model = FactoredInference(dom).estimate(measurements)
    df = model.synthetic_data().df

    self._denumerize(df)
    self._sample_numerical(df)
    return df
def postprocess(self):
    """Run inference in chunks of 500 iterations and store synthetic data.

    A warm-started ``FactoredInference`` engine is created and kept on
    ``self.engine``.  If ``self.warmup`` is set, the engine's potentials and
    marginals are initialised from products of normalised one-way factors.
    ``engine.infer`` is then called ``self.iters // 500`` times; after every
    fourth call the current synthetic data is written to a checkpoint CSV.
    The checkpoint file is removed at the end and the final synthetic data is
    stored on ``self.synthetic``.
    """
    engine = FactoredInference(self.domain,
                               structural_zeros=None,
                               iters=500,
                               log=True,
                               warm_start=True,
                               elim_order=self.elimination_order)
    self.engine = engine
    cb = mbi.callbacks.Logger(engine)

    if self.warmup:
        engine._setup(self.measurements, None)

        # One normalised factor per round-1 projection; values are clipped
        # below at 1 before normalising.
        oneway = {}
        for i, p in enumerate(self.round1):
            y = np.maximum(self.measurements[i][1], 1)
            y /= y.sum()
            oneway[p] = Factor(self.domain.project(p), y)

        # Each clique marginal starts as the product of its one-way factors.
        marginals = {}
        for cl in engine.model.cliques:
            marginals[cl] = reduce(lambda x, y: x * y, [oneway[p] for p in cl])

        theta = engine.model.mle(marginals)
        engine.potentials = theta
        engine.marginals = engine.model.belief_prop_fast(theta)

    # Checkpoint path: self.save with its last four characters replaced.
    checkpt = self.save[:-4] + '-checkpt.csv'
    for i in range(self.iters // 500):
        engine.infer(self.measurements, engine='MD', callback=cb)

        if i % 4 == 3:
            self.synthetic = engine.model.synthetic_data()
            self.synthetic = reverse_data(self.synthetic, self.supports)
            self.transform_domain()
            self.synthetic.to_csv(checkpt, index=False)

    if os.path.exists(checkpt):
        os.remove(checkpt)

    self.synthetic = engine.model.synthetic_data()
    self.synthetic = reverse_data(self.synthetic, self.supports)
# Script fragment: runs DualQuery on a workload, fits a model with a custom
# metric through mirror descent, then compares the two sets of answers.
# `data`, `workloads`, `args`, `matrix`, `Negated`, `DualQuery` and
# `marginal_loss` are defined outside the visible portion.

# Number of rows in the data frame.
total = data.df.shape[0]

# Stack every workload matrix W on top of Negated(W).
workload = []
for cl, W in workloads:
    workload.append((cl, matrix.VStack([W, Negated(W)])))

synthetic, cache = DualQuery(data, workload, eps=args.epsilon, delta=1e-3, seed=args.seed)

# Custom loss handed to the engine in place of a built-in metric name.
metric = lambda marginals: marginal_loss(marginals, workload, cache)
engine = FactoredInference(data.domain, metric=metric, iters=args.iters)

# The answer slot of every measurement is None.
# NOTE(review): presumably the loss works from `cache` rather than from
# per-measurement answers -- confirm against marginal_loss.
measurements = [(Q, None, 1.0, cl) for cl, Q in workload]
ans = engine.mirror_descent(measurements, total)
model = engine.model

# NOTE(review): `mb` and `dq` are not filled in the visible portion; the loop
# below presumably continues past this fragment.
mb = []
dq = []
for proj, W in workloads:
    # Workload answers from the fitted model.
    est = W.dot(model.project(proj).datavector())
    # Workload answers from the DualQuery output, rescaled in place so the
    # data vector sums to `total`.
    x = synthetic.project(proj).datavector()
    x *= total / x.sum()
    est2 = W.dot(x)
    true = W.dot(data.project(proj).datavector())
    # Sum of absolute errors divided by the sum of absolute true answers.
    err = np.abs(est - true).sum() / np.abs(true).sum()
    err2 = np.abs(est2 - true).sum() / np.abs(true).sum()
""" Efficiently take measurements from HDMM strategy and convert to a PGM-compatable form """ A = workload.union_kron_canonical(A) measurements = [] for Ai in A.matrices: w = Ai.weight proj = [ attributes[i] for i, B in enumerate(Ai.base.matrices) if type(B) != workload.Ones ] print(proj) matrix = workload.Kronecker( [B for B in Ai.base.matrices if type(B) != workload.Ones]) matrix = w * matrix.sparse_matrix() x = data.project( proj).datavector() # does Relation have this functionality? y = matrix.dot(x) + np.random.laplace( loc=0, scale=1, size=matrix.shape[0]) measurements.append((matrix, y, 1.0, proj)) return measurements measurements = take_measurements(A, data) engine = FactoredInference(dom) model = engine.estimate(measurements) df = model.synthetic_data().df print(df.head()) # Then you can post-process to change category/bin ids with values
def setUp(self):
    """Build the base inference fixtures, then swap in a torch-backed engine.

    Raises ``unittest.SkipTest`` when the module-level ``skip`` flag is set.
    """
    if skip:
        raise unittest.SkipTest('PyTorch not installed')

    # Reuse the fixture setup of the base inference test case.
    test_inference.TestInference.setUp(self)

    torch_engine = FactoredInference(self.domain, backend='torch', log=True)
    self.engine = torch_engine
# Script fragment: add Laplace noise to two marginals, fit a model, and read
# back measured and unmeasured marginals.  `ab`, `bc` and `domain` are defined
# outside the visible portion.

# With epsilon = sqrt(2) the noise scale sqrt(2) / epsilon equals 1.
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon

# Fixed seed so the noise draws below are reproducible.
np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference:
# (query matrix, noisy answer, noise scale, attributes)
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)
measurements = [(Iab, yab, sigma, ['A', 'B']), (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')

# recover consistent estimates of measurements
ab2 = model.project(['A','B']).datavector()
bc2 = model.project(['B','C']).datavector()
print(ab2)
print(bc2)

# estimate answer to unmeasured queries
ac2 = model.project(['A','C']).datavector()
print(ac2)

# generate synthetic data
# Command-line driver: parse options, build the requested inference engine,
# run mwem on the adult benchmark and report the average workload error.
parser = argparse.ArgumentParser(description=description, formatter_class=formatter)

# Options are registered from a table, in the same order as before.
for _flag, _spec in (
    ('--dataset', dict(choices=['adult'], help='dataset to use')),
    ('--engine', dict(choices=['MW', 'PGM'], help='inference engine')),
    ('--iters', dict(type=int, help='number of optimization iterations')),
    ('--rounds', dict(type=int, help='number of rounds to run mwem')),
    ('--epsilon', dict(type=float, help='privacy parameter')),
    ('--seed', dict(type=int, help='random seed')),
):
    parser.add_argument(_flag, **_spec)

parser.set_defaults(**default_params())
args = parser.parse_args()

data, workloads = benchmarks.adult_benchmark()
prng = np.random.RandomState(args.seed)

# 'MW' selects multiplicative weights; anything else uses FactoredInference.
engine_cls = FactoredMultiplicativeWeights if args.engine == 'MW' else FactoredInference
engine = engine_cls(data.domain, iters=args.iters)

# Without an explicit --rounds, run one round per attribute in the domain.
rounds = len(data.domain) if args.rounds is None else args.rounds

ans = mwem(workloads, data, args.epsilon, engine, iters=rounds, prng=prng)
error = average_error(workloads, data, ans)
print('Error: %.3f' % error)
# Script fragment: measure one-way marginals and a handful of larger
# marginals with Laplace noise, fit a model, and answer new queries.
# `data`, `domain`, `total`, `Identity` and the initial `sigma` are defined
# outside the visible portion.

# Measure every single-attribute marginal with Laplace noise of scale `sigma`.
measurements = []
for col in data.domain:
    x = data.project(col).datavector()
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    I = Identity(x.size)
    measurements.append((I, y, sigma, (col, )))

# spend half of privacy budget to measure some more 2 and 3 way marginals
cliques = [('age', 'education-num'),
           ('marital-status', 'race'),
           ('sex', 'hours-per-week'),
           ('hours-per-week', 'income>50K'),
           ('native-country', 'marital-status', 'occupation')]

# `sigma` is reassigned here; with the five cliques above it evaluates to 0.1.
sigma = 1.0 / len(cliques) / 2.0
for cl in cliques:
    x = data.project(cl).datavector()
    y = x + np.random.laplace(loc=0, scale=sigma, size=x.size)
    I = Identity(x.size)
    measurements.append((I, y, sigma, cl))

# now perform inference to estimate the data distribution
engine = FactoredInference(domain, backend='torch', log=True, iters=10000)
model = engine.estimate(measurements, total=total, engine='RDA')

# now answer new queries
y1 = model.project(('sex', 'income>50K')).datavector()
y2 = model.project(('race', 'occupation')).datavector()