Example #1
0
    def test_binary(self):
        """Check the domains produced by factor/factor and factor/scalar arithmetic.

        Relies on ``self.factor`` prepared by the test fixture (presumably the
        factor over 'a', 'b', 'c' -- verify against setUp).
        """
        other = Factor(Domain(['b', 'd', 'e'], [3, 5, 6]), torch.rand(3, 5, 6))

        # factor (op) factor: the result is expected on the combined attributes
        product = self.factor * other
        combined = Domain(['a', 'b', 'c', 'd', 'e'], [2, 3, 4, 5, 6])
        self.assertEqual(product.domain, combined)

        total = self.factor + other
        self.assertEqual(total.domain, combined)

        # factor (op) scalar: the domain must be left untouched
        scalar_ops = (
            lambda f: f * 2.0,
            lambda f: f + 2.0,
            lambda f: f - 2.0,
        )
        for apply_op in scalar_ops:
            self.assertEqual(apply_op(self.factor).domain, self.factor.domain)

        # exp followed by log should round-trip both the domain and the values
        roundtrip = self.factor.exp().log()
        self.assertEqual(roundtrip.domain, self.factor.domain)
        same_values = np.allclose(roundtrip.datavector(),
                                  self.factor.datavector())
        self.assertTrue(same_values)
Example #2
0
    def __init__(self, dataset, specs):
        """Build the attribute domain from the specs file, the codebook and the data.

        :param dataset: path to the ground-truth csv file
        :param specs: path to a json file; each column entry is read for its 'maxval'

        The codebook is read from 'domain.json' in the current working directory.
        """
        self.dataset = dataset
        # context managers close the json files instead of leaking the handles
        with open(specs, 'r') as specs_file:
            self.specs = json.load(specs_file)
        with open('domain.json') as codebook_file:
            domain_info = json.load(codebook_file)

        # check consistency for codebook information: drop codebook entries
        # whose last value is smaller than the maxval declared in the specs
        for col in list(domain_info):
            if domain_info[col][-1] < self.specs[col]['maxval']:
                print('Codebook inconsistent for', col)
                del domain_info[col]

        # look at ground truth data to obtain possible values for
        # state-dependent columns
        df = pd.read_csv(dataset)
        for col in ['SEA', 'METAREA', 'COUNTY', 'CITY', 'METAREAD']:
            domain_info[col] = sorted(df[col].unique())
        # done using ground truth data

        # size of each column: number of known values when the codebook (or the
        # data) lists them, otherwise maxval + 1
        domain = {}
        for col in self.specs:
            if col in domain_info:
                domain[col] = len(domain_info[col])
            else:
                domain[col] = self.specs[col]['maxval'] + 1

        # INCWAGE is dropped from the domain; INCWAGE_A and INCWAGE_B are added
        # with fixed sizes, and VALUEH gets a fixed size as well
        domain['INCWAGE_A'] = 52
        domain['INCWAGE_B'] = 8
        del domain['INCWAGE']
        domain['VALUEH'] = 5003

        self.domain_info = domain_info
        self.domain = Domain.fromdict(domain)
Example #3
0
 def setUp(self):
     """Create a random factor over attributes a, b, c with shape 2 x 3 x 4."""
     if skip:
         raise unittest.SkipTest('PyTorch not installed')
     dims = [2, 3, 4]
     self.factor = Factor(Domain(['a', 'b', 'c'], dims), torch.rand(*dims))
Example #4
0
    def test_expand(self):
        """Expanding onto a larger domain, then averaging out the new axis, is a no-op."""
        target = Domain(['a', 'b', 'c', 'd'], [2, 3, 4, 5])
        expanded = self.factor.expand(target)
        self.assertEqual(expanded.domain, target)
        self.assertEqual(expanded.values.shape, target.shape)

        # 'd' has 5 entries, so summing it out and scaling by 0.2 (= 1/5)
        # should give back the original values
        collapsed = expanded.sum(['d']) * 0.2
        unchanged = torch.allclose(collapsed.values, self.factor.values)
        self.assertTrue(unchanged)
Example #5
0
    def test_project(self):
        """Projection onto ['c', 'a'] yields that domain and shape for both aggregations."""
        expected = Domain(['c', 'a'], [4, 2])
        for agg in ('sum', 'logsumexp'):
            projected = self.factor.project(['c', 'a'], agg=agg)
            self.assertEqual(projected.domain, expected)
            self.assertEqual(projected.values.shape, (4, 2))
Example #6
0
    def load(path, domain):
        """ Load data into a dataset object

        :param path: path to csv file
        :param domain: path to json file encoding the domain information
        :return: a Dataset built from the csv contents and a Domain whose
            attributes/sizes are the keys/values of the json file
        """
        df = pd.read_csv(path)
        # use a context manager so the json file is closed instead of leaked
        with open(domain) as config_file:
            config = json.load(config_file)
        return Dataset(df, Domain(config.keys(), config.values()))
Example #7
0
def reverse_data(data, supports):
    """Map compressed column codes back onto the full index range of each support.

    :param data: object exposing ``df`` (a data frame) and ``domain`` (iterable of columns)
    :param supports: mapping from column to a boolean array marking supported values
    :return: a Dataset over a domain whose size per column is ``support.size``
    """
    frame = data.df.copy()
    sizes = {}
    for col in data.domain:
        support = supports[col]
        overflow_code = support.sum()
        sizes[col] = int(support.size)
        supported = np.where(support)[0]
        unsupported = np.where(~support)[0]
        # rows carrying the overflow code stand for "some unsupported value"
        is_overflow = frame[col] == overflow_code
        if unsupported.size > 0:
            # replace them by a randomly drawn unsupported value
            frame.loc[is_overflow, col] = np.random.choice(
                unsupported, is_overflow.sum())
        # all remaining rows hold a position into the supported values
        frame.loc[~is_overflow, col] = supported[frame.loc[~is_overflow, col]]
    return Dataset(frame, Domain.fromdict(sizes))
Example #8
0
    def postprocess(self):
        """Fit PGM inference to the noisy measurements and generate synthetic data.

        Stores the inference engine in ``self.engine`` and the synthetic data,
        mapped back through ``reverse_data``, in ``self.synthetic``.
        """
        # NOTE(review): self.iters is read but never used -- the engine below is
        # always given iters=10000. Confirm which value is intended.
        iters = self.iters
        inference_domain = Domain.fromdict(self.domain)
        self.engine = FactoredInference(
            inference_domain,
            structural_zeros=None,
            iters=10000,
            log=True,
            warm_start=False,
            elim_order=self.elimination_order,
        )
        self.engine.estimate(self.measurements)

        # sample from the fitted model, then undo the support compression
        self.synthetic = self.engine.model.synthetic_data()
        self.synthetic = reverse_data(self.synthetic, self.supports)
Example #9
0
    def synthesize(self, file_path, eps, seed):
        """Measure the configured strategy with Laplace noise and sample synthetic data.

        :param file_path: path of the csv file loaded into the relation
        :param eps: value divided among the Laplace measurements in proportion
            to each strategy matrix's weight
        :param seed: seed of the numpy RandomState handed to the measurements
        :return: data frame of synthetic records after ``_denumerize`` and
            ``_sample_numerical`` have been applied
        """
        # setup random state
        rng = np.random.RandomState(seed)

        # load data vector
        relation = Relation(self.config)
        relation.load_csv(file_path)
        self._numerize(relation._df)

        # perform measurement
        attributes = list(self.config.keys())
        total_weight = sum(Ai.weight for Ai in self.strategy.matrices)
        measurements = []
        for Ai in self.strategy.matrices:
            weight = Ai.weight
            # keep only the per-attribute matrices that are not 'Ones',
            # together with the attribute each of them belongs to
            kept = [(attributes[i], B)
                    for i, B in enumerate(Ai.base.matrices)
                    if type(B).__name__ != 'Ones']
            proj = [attr for attr, _ in kept]
            blocks = [B for _, B in kept]
            if blocks:
                matrix = Kronecker(blocks)
            else:
                matrix = EkteloMatrix(np.ones((1, 1)))

            projected = copy.deepcopy(relation)
            projected.project(proj)
            n_rows, n_cols = projected.df.shape[0], projected.df.shape[1]
            if n_cols == 0:
                # nothing left to vectorize: the data vector is the row count
                x = np.array([n_rows])
            else:
                x = Vectorize('').transform(projected).flatten()

            y = Laplace(matrix, weight * eps / total_weight).measure(x, rng)
            measurements.append(
                (matrix.sparse_matrix(), y, 1.0 / weight, proj))

        # generate synthetic data
        sizes = [field['bins'] for field in self.config.values()]
        engine = FactoredInference(Domain(attributes, sizes))
        model = engine.estimate(measurements)
        df = model.synthetic_data().df
        self._denumerize(df)
        self._sample_numerical(df)

        return df
Example #10
0
def transform_data(data, supports):
    """Re-code every column so that supported values become consecutive integers.

    Supported values are numbered 0..size-1 in index order; every unsupported
    value is mapped to the single extra code ``size``.

    :param data: object exposing ``df`` (a data frame) and ``domain`` (iterable of columns)
    :param supports: mapping from column to a boolean array marking supported values
    :return: a Dataset holding the re-coded frame and the compressed domain
    """
    frame = data.df.copy()
    sizes = {}
    for col in data.domain:
        support = supports[col]
        size = support.sum()
        # one additional slot is needed only when something is unsupported
        has_unsupported = size < support.size
        sizes[col] = int(size) + 1 if has_unsupported else int(size)

        recode = {}
        next_code = 0
        for i in range(support.size):
            if support[i]:
                recode[i] = next_code
                next_code += 1
            else:
                recode[i] = size
        assert next_code == size
        frame[col] = frame[col].map(recode)
    return Dataset(frame, Domain.fromdict(sizes))
Example #11
0
def randomKway(name,
               number,
               marginal,
               proj=None,
               seed=0,
               filter=None,
               root_path='./',
               args=None):
    """Load the dataset called ``name`` and pass it on to ``randomKwayData``.

    :param name: dataset name; ``Datasets/<name>.csv`` and
        ``Datasets/<name>-domain.json`` are read below ``root_path``
    :param number: forwarded to ``randomKwayData``
    :param marginal: forwarded to ``randomKwayData``
    :param proj: optional attributes to project the dataset onto
    :param seed: forwarded to ``randomKwayData``
    :param filter: optional ``(column, value)`` pair; only rows where the column
        equals the value are kept and the column is then dropped
        (the name shadows the builtin but is part of the public signature)
    :param root_path: directory containing the ``Datasets`` folder
    :param args: optional object with an ``adult_seed`` attribute; only
        consulted when ``name == 'adult'``
    :return: tuple ``(data, randomKwayData(...))``
    """
    check_size = name in ['adult_orig', 'loans']
    path = os.path.join(root_path, "Datasets/{}.csv".format(name))
    df = pd.read_csv(path)

    domain_path = os.path.join(root_path,
                               "Datasets/{}-domain.json".format(name))
    # close the json file deterministically instead of leaking the handle
    with open(domain_path) as domain_file:
        config = json.load(domain_file)
    domain = Domain(config.keys(), config.values())

    if name == 'adult':
        # args defaults to None; treat that like "no seed given" rather than
        # failing with AttributeError on None.adult_seed
        adult_seed = None if args is None else args.adult_seed
        if adult_seed is not None:
            prng = np.random.RandomState(adult_seed)
            # Bernoulli(0.9) flag per row
            mask = prng.binomial(1, 0.9, size=len(df))
            df.loc[:, '_split'] = mask
        else:
            df.loc[:, '_split'] = 1

    if filter is not None:
        col, val = filter
        df = df[df[col] == val].reset_index(drop=True)
        del df[col]

    # cast the frame to the dtype chosen by get_min_dtype for the largest
    # attribute size
    domain_max = max(domain.config.values())
    dtype = get_min_dtype(domain_max)
    df = df.astype(dtype)

    data = Dataset(df, domain)
    if proj is not None:
        data = data.project(proj)
    return data, randomKwayData(data,
                                number,
                                marginal,
                                seed,
                                check_size=check_size)
Example #12
0
 def krondot(self, matrices):
     """ Answer the Kronecker-product query set Q1 x Q2 x ... x Qd.

     Qi is a query matrix on the ith attribute and "x" is the Kronecker product.
     This may be more efficient than computing a supporting marginal and then
     multiplying it by Q, in particular if each Qi has only a few rows.

     :param matrices: a list of matrices for each attribute in the domain
     :return: the vector of query answers
     """
     assert all(Q.shape[1] == size
                for Q, size in zip(matrices, self.domain.shape)), \
         'matrices must conform to the shape of the domain'
     log_partition = self.belief_propagation(self.potentials, logZ=True)
     factors = [self.potentials[clique].exp() for clique in self.cliques]
     factor_cls = type(factors[0])  # build query factors of the same type
     attrs = self.domain.attrs
     # one extra factor per attribute linking it to its '<attr>-answer' axis
     factors.extend(
         factor_cls(Domain(['%s-answer' % attr, attr], Q.shape), Q)
         for attr, Q in zip(attrs, matrices))
     # eliminate the original attributes so only the answer axes are left
     joint = variable_elimination(factors, attrs)
     answers = joint.transpose(['%s-answer' % attr for attr in attrs])
     return answers.datavector(flatten=False) * self.total / np.exp(log_partition)
Example #13
0
from hdmm.templates import DefaultKron, Marginals, DefaultUnionKron
from hdmm import workload
from mbi import FactoredInference, Domain, Dataset
import numpy as np
from IPython import embed

# set up domain and workload: three attributes with 32 values each
attributes = [
    'A', 'B', 'C'
]  # should be the names of the columns; 'A', 'B', 'C' are placeholders
sizes = [32, 32, 32]
dom = Domain(attributes, sizes)
# alternative workload, kept for reference:
#W = workload.Prefix2D(32)
W = workload.DimKMarginals(sizes, 1)
# synthetic dataset with 1000 rows over the domain
data = Dataset.synthetic(dom, 1000)

# optimize strategy using HDMM
# alternative templates, kept for reference:
#template = DefaultKron(sizes)
#template = Marginals(sizes)
template = DefaultUnionKron(sizes, 3)
template.optimize(W)
# A is the optimized strategy consumed by the measurement code below
A = template.strategy()


def take_measurements(A, data):
    """ Efficiently take measurements from HDMM strategy and convert to a PGM-compatable form """
    A = workload.union_kron_canonical(A)
    measurements = []
    for Ai in A.matrices:
        w = Ai.weight
        proj = [
Example #14
0
from mbi import Dataset, FactoredInference, Domain
import numpy as np

# discrete domain with attributes A, B, C and corresponding size 2 x 3 x 4
domain = Domain(['A','B','C'], [2, 3, 4])

# synthetic dataset with 1000 rows
data = Dataset.synthetic(domain, 1000) 

# project data onto subset of cols, and vectorize
ab = data.project(['A','B']).datavector()
bc = data.project(['B','C']).datavector()

# add noise to preserve differential privacy
# sigma is the scale of the Laplace noise added to each entry below
epsilon = np.sqrt(2)
sigma = np.sqrt(2.0) / epsilon

# fixed seed so the noise draws are reproducible
np.random.seed(0)
yab = ab + np.random.laplace(loc=0, scale=sigma, size=ab.size)
ybc = bc + np.random.laplace(loc=0, scale=sigma, size=bc.size)

# record the measurements in a form needed by inference:
# (query matrix, noisy answers, noise scale, attributes measured)
Iab = np.eye(ab.size)
Ibc = np.eye(bc.size)

measurements = [(Iab, yab, sigma, ['A', 'B']),
                (Ibc, ybc, sigma, ['B', 'C'])]

# estimate the data distribution
# NOTE(review): 'MD' presumably selects the estimation algorithm -- confirm
# against the FactoredInference.estimate documentation
engine = FactoredInference(domain)
model = engine.estimate(measurements, engine='MD')
Example #15
0
 def test_logsumexp(self):
     """logsumexp over 'a' and 'c' must match log(sum(exp(.))) over axes 0 and 2."""
     reduced = self.factor.logsumexp(['a', 'c'])
     # reference result computed directly on the raw tensor
     expected = self.factor.values.exp().sum(dim=(0, 2)).log()
     self.assertEqual(reduced.domain, Domain(['b'], [3]))
     self.assertTrue(torch.allclose(reduced.values, expected))
Example #16
0
 def test_sum(self):
     """Summing out 'a' and 'b' leaves a factor over 'c' equal to the tensor sum over axes 0 and 1."""
     summed = self.factor.sum(['a', 'b'])
     self.assertEqual(summed.domain, Domain(['c'], [4]))
     expected = self.factor.values.sum(dim=(0, 1))
     self.assertTrue(torch.allclose(summed.values, expected))
Example #17
0
 def test_transpose(self):
     """Transposing to ['b', 'c', 'a'] must reorder the domain to sizes 3, 4, 2."""
     order = ['b', 'c', 'a']
     transposed = self.factor.transpose(order)
     self.assertEqual(transposed.domain, Domain(order, [3, 4, 2]))