def test_binary(self):
    dom = Domain(['b', 'd', 'e'], [3, 5, 6])
    vals = torch.rand(3, 5, 6)
    factor = Factor(dom, vals)
    res = self.factor * factor
    ans = Domain(['a', 'b', 'c', 'd', 'e'], [2, 3, 4, 5, 6])
    self.assertEqual(res.domain, ans)
    res = self.factor + factor
    self.assertEqual(res.domain, ans)
    res = self.factor * 2.0
    self.assertEqual(res.domain, self.factor.domain)
    res = self.factor + 2.0
    self.assertEqual(res.domain, self.factor.domain)
    res = self.factor - 2.0
    self.assertEqual(res.domain, self.factor.domain)
    res = self.factor.exp().log()
    self.assertEqual(res.domain, self.factor.domain)
    self.assertTrue(np.allclose(res.datavector(), self.factor.datavector()))
def __init__(self, dataset, specs):
    self.dataset = dataset
    self.specs = json.load(open(specs, 'r'))

    domain_info = json.load(open('domain.json'))
    # check consistency of the codebook information
    for col in list(domain_info):
        if domain_info[col][-1] < self.specs[col]['maxval']:
            print('Codebook inconsistent for', col)
            del domain_info[col]

    ## look at ground truth data to obtain possible values for state-dependent columns
    df = pd.read_csv(dataset)
    for col in ['SEA', 'METAREA', 'COUNTY', 'CITY', 'METAREAD']:
        domain_info[col] = sorted(df[col].unique())
    ## done using ground truth data

    domain = {}
    for col in self.specs:
        if col in domain_info:
            domain[col] = len(domain_info[col])
        else:
            domain[col] = self.specs[col]['maxval'] + 1

    domain['INCWAGE_A'] = 52
    domain['INCWAGE_B'] = 8
    del domain['INCWAGE']  # domain['INCWAGE'] = 5002
    domain['VALUEH'] = 5003

    self.domain_info = domain_info
    self.domain = Domain.fromdict(domain)
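# Illustrative shapes of the two JSON inputs above (an assumption inferred from
# the accesses in __init__, not taken from the codebase):
#
#   specs JSON:   {"AGE": {"maxval": 135}, ...}
#   domain.json:  {"AGE": [0, 1, ..., 135], ...}   # sorted list of legal codes
#
# With these shapes, the consistency check flags any column whose largest
# codebook value falls short of the spec's declared maximum.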
def setUp(self):
    if skip:
        raise unittest.SkipTest('PyTorch not installed')
    attrs = ['a', 'b', 'c']
    shape = [2, 3, 4]
    domain = Domain(attrs, shape)
    values = torch.rand(*shape)
    self.factor = Factor(domain, values)
def test_expand(self):
    domain = Domain(['a', 'b', 'c', 'd'], [2, 3, 4, 5])
    res = self.factor.expand(domain)
    self.assertEqual(res.domain, domain)
    self.assertEqual(res.values.shape, domain.shape)
    # summing out 'd' (size 5) scales every entry by 5; multiply by 0.2 to undo
    res = res.sum(['d']) * 0.2
    self.assertTrue(torch.allclose(res.values, self.factor.values))
def test_project(self):
    res = self.factor.project(['c', 'a'], agg='sum')
    ans = Domain(['c', 'a'], [4, 2])
    self.assertEqual(res.domain, ans)
    self.assertEqual(res.values.shape, (4, 2))
    res = self.factor.project(['c', 'a'], agg='logsumexp')
    self.assertEqual(res.domain, ans)
    self.assertEqual(res.values.shape, (4, 2))
def load(path, domain):
    """ Load data into a Dataset object

    :param path: path to csv file
    :param domain: path to json file encoding the domain information
    """
    df = pd.read_csv(path)
    config = json.load(open(domain))
    domain = Domain(config.keys(), config.values())
    return Dataset(df, domain)
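# Usage sketch for load (hypothetical file names): the JSON file maps each
# column name to its domain size, e.g. {"A": 2, "B": 3, "C": 4}.
#
#   data = load('adult.csv', 'adult-domain.json')
#   print(data.domain)        # Domain over the columns of adult.csv
#   print(data.datavector())  # flattened contingency table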
def reverse_data(data, supports):
    """ Map a dataset from the compressed domain back to the original domain. """
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        mx = support.sum()  # index of the fold-in "extra" value in the compressed domain
        newdom[col] = int(support.size)
        idx, extra = np.where(support)[0], np.where(~support)[0]
        mask = df[col] == mx
        if extra.size > 0:
            # spread the folded "extra" value uniformly over the unsupported values
            df.loc[mask, col] = np.random.choice(extra, mask.sum())
        df.loc[~mask, col] = idx[df.loc[~mask, col]]
    newdom = Domain.fromdict(newdom)
    return Dataset(df, newdom)
def postprocess(self):
    # use the noisy measurements to fit a PGM inference engine
    # and generate synthetic data
    iters = self.iters
    domain = self.domain
    temp_domain = Domain.fromdict(domain)
    engine = FactoredInference(temp_domain,
                               structural_zeros=None,
                               iters=10000,
                               log=True,
                               warm_start=False,
                               elim_order=self.elimination_order)
    self.engine = engine
    engine.estimate(self.measurements)
    self.synthetic = self.engine.model.synthetic_data()
    self.synthetic = reverse_data(self.synthetic, self.supports)
def synthesize(self, file_path, eps, seed):
    # set up random state
    prng = np.random.RandomState(seed)

    # load data vector
    relation = Relation(self.config)
    relation.load_csv(file_path)
    self._numerize(relation._df)

    # perform measurement
    attributes = list(self.config.keys())
    measurements = []
    w_sum = sum(Ai.weight for Ai in self.strategy.matrices)
    for Ai in self.strategy.matrices:
        w = Ai.weight
        proj = [attributes[i] for i, B in enumerate(Ai.base.matrices)
                if type(B).__name__ != 'Ones']
        matrix = [B for B in Ai.base.matrices if type(B).__name__ != 'Ones']
        matrix = EkteloMatrix(np.ones((1, 1))) if len(matrix) == 0 else Kronecker(matrix)

        proj_rel = copy.deepcopy(relation)
        proj_rel.project(proj)
        if proj_rel.df.shape[1] == 0:
            x = np.array([proj_rel.df.shape[0]])
        else:
            x = Vectorize('').transform(proj_rel).flatten()

        y = Laplace(matrix, w * eps / w_sum).measure(x, prng)
        measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))

    # generate synthetic data
    sizes = [field['bins'] for field in self.config.values()]
    dom = Domain(attributes, sizes)
    engine = FactoredInference(dom)
    model = engine.estimate(measurements)
    df = model.synthetic_data().df
    self._denumerize(df)
    self._sample_numerical(df)

    return df
def transform_data(data, supports):
    """ Map a dataset onto a compressed domain: supported values are re-indexed
        contiguously, and all unsupported values fold into one extra value. """
    df = data.df.copy()
    newdom = {}
    for col in data.domain:
        support = supports[col]
        size = support.sum()
        newdom[col] = int(size)
        if size < support.size:
            newdom[col] += 1  # reserve one extra value for the unsupported mass
        mapping = {}
        idx = 0
        for i in range(support.size):
            mapping[i] = size  # unsupported values map to the extra value
            if support[i]:
                mapping[i] = idx
                idx += 1
        assert idx == size
        df[col] = df[col].map(mapping)
    newdom = Domain.fromdict(newdom)
    return Dataset(df, newdom)
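# Sketch of how transform_data and reverse_data fit together (the support
# construction below is an assumption for illustration, not from this file):
# keep values seen at least `threshold` times, fold the rest into one extra
# value, and let reverse_data redistribute that value on the way back.
#
#   counts = data.df['AGE'].value_counts().reindex(
#       range(data.domain['AGE']), fill_value=0)
#   supports = {'AGE': (counts >= 3).values, ...}
#   compressed = transform_data(data, supports)
#   recovered = reverse_data(compressed, supports)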
def randomKway(name, number, marginal, proj=None, seed=0, filter=None, root_path='./', args=None):
    check_size = name in ['adult_orig', 'loans']
    path = os.path.join(root_path, "Datasets/{}.csv".format(name))
    df = pd.read_csv(path)
    domain = os.path.join(root_path, "Datasets/{}-domain.json".format(name))
    config = json.load(open(domain))
    domain = Domain(config.keys(), config.values())

    if name == 'adult':
        if args.adult_seed is not None:
            prng = np.random.RandomState(args.adult_seed)
            mask = prng.binomial(1, 0.9, size=len(df))
            df.loc[:, '_split'] = mask
        else:
            df.loc[:, '_split'] = 1

    if filter is not None:
        col, val = filter
        df = df[df[col] == val].reset_index(drop=True)
        del df[col]

    domain_max = max(domain.config.values())
    dtype = get_min_dtype(domain_max)
    df = df.astype(dtype)

    data = Dataset(df, domain)
    if proj is not None:
        data = data.project(proj)
    return data, randomKwayData(data, number, marginal, seed, check_size=check_size)
def krondot(self, matrices):
    """ Compute the answer to the set of queries Q1 x Q2 x ... x Qd, where
        Qi is a query matrix on the ith attribute and "x" is the Kronecker
        product. This may be more efficient than computing a supporting
        marginal and then multiplying it by Q, especially when each Qi has
        only a few rows.

    :param matrices: a list of matrices, one for each attribute in the domain
    :return: the vector of query answers
    """
    assert all(M.shape[1] == n for M, n in zip(matrices, self.domain.shape)), \
        'matrices must conform to the shape of the domain'
    logZ = self.belief_propagation(self.potentials, logZ=True)
    factors = [self.potentials[cl].exp() for cl in self.cliques]
    Factor = type(factors[0])  # infer the type of the factors
    elim = self.domain.attrs
    for attr, Q in zip(elim, matrices):
        d = Domain(['%s-answer' % attr, attr], Q.shape)
        factors.append(Factor(d, Q))
    result = variable_elimination(factors, elim)
    result = result.transpose(['%s-answer' % a for a in elim])
    return result.datavector(flatten=False) * self.total / np.exp(logZ)
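# Usage sketch for krondot (hypothetical query matrices): answer all one-way
# "sum" queries at once. Each Qi below is a single row of ones, so the result
# is the estimated total; `model` is assumed to be a fitted GraphicalModel.
#
#   matrices = [np.ones((1, n)) for n in model.domain.shape]
#   answers = model.krondot(matrices)   # array of shape (1, 1, ..., 1)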
from hdmm.templates import DefaultKron, Marginals, DefaultUnionKron
from hdmm import workload
from mbi import FactoredInference, Domain, Dataset
import numpy as np
from IPython import embed

# set up domain and workload
attributes = ['A', 'B', 'C']  # should be the names of the columns
sizes = [32, 32, 32]
dom = Domain(attributes, sizes)
#W = workload.Prefix2D(32)
W = workload.DimKMarginals(sizes, 1)
data = Dataset.synthetic(dom, 1000)

# optimize strategy using HDMM
#template = DefaultKron(sizes)
#template = Marginals(sizes)
template = DefaultUnionKron(sizes, 3)
template.optimize(W)
A = template.strategy()

def take_measurements(A, data):
    """ Efficiently take measurements from an HDMM strategy and convert
        to a PGM-compatible form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        # The remainder of this loop is a sketch: it mirrors the measurement
        # loop in synthesize() above, and assumes each term of the strategy
        # has at least one non-trivial (non-Ones) factor.
        proj = [attr for attr, B in zip(data.domain.attrs, Ai.base.matrices)
                if type(B).__name__ != 'Ones']
        matrix = workload.Kronecker(
            [B for B in Ai.base.matrices if type(B).__name__ != 'Ones'])
        x = data.project(proj).datavector()
        y = matrix.dot(x) + np.random.laplace(loc=0, scale=1.0 / w, size=matrix.shape[0])
        measurements.append((matrix.sparse_matrix(), y, 1.0 / w, proj))
    return measurements
from mbi import Dataset, FactoredInference, Domain
import numpy as np

# discrete domain with attributes A, B, C of sizes 2, 3, and 4
domain = Domain(['A', 'B', 'C'], [2, 3, 4])

# synthetic dataset with 1000 rows
data = Dataset.synthetic(domain, 1000)

# project data onto subsets of columns, and vectorize
ab = data.project(['A', 'B']).datavector()
bc = data.project(['B', 'C']).datavector()

# add noise to preserve differential privacy
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon
np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)
measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')
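# A short follow-up sketch: the fitted model exposes the same
# project/datavector API used on the raw data above, so estimated marginals
# can be read off directly; synthetic_data() appears elsewhere in this repo.
ab_est = model.project(['A', 'B']).datavector()  # estimate of the noisy 'ab' marginal
synth = model.synthetic_data()                   # Dataset sampled from the fitted model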
def test_logsumexp(self):
    res = self.factor.logsumexp(['a', 'c'])
    values = self.factor.values
    ans = torch.log(torch.sum(torch.exp(values), dim=(0, 2)))
    self.assertEqual(res.domain, Domain(['b'], [3]))
    self.assertTrue(torch.allclose(res.values, ans))
def test_sum(self):
    res = self.factor.sum(['a', 'b'])
    self.assertEqual(res.domain, Domain(['c'], [4]))
    self.assertTrue(
        torch.allclose(res.values, self.factor.values.sum(dim=(0, 1))))
def test_transpose(self):
    attrs = ['b', 'c', 'a']
    tr = self.factor.transpose(attrs)
    ans = Domain(attrs, [3, 4, 2])
    self.assertEqual(tr.domain, ans)