def __init__(self, dataset, specs):
    """Load the specs and codebook and build the per-column domain.

    Args:
        dataset: Path (or anything ``pd.read_csv`` accepts) of the CSV
            holding the ground-truth data.
        specs: Path to a JSON file mapping each column name to a dict
            that contains at least a ``'maxval'`` entry.

    Sets ``self.dataset``, ``self.specs``, ``self.domain_info`` and
    ``self.domain``.
    """
    self.dataset = dataset

    # Use context managers so both JSON files are closed deterministically
    # instead of being left to the garbage collector.
    with open(specs, 'r') as specs_file:
        self.specs = json.load(specs_file)
    # NOTE(review): 'domain.json' is resolved relative to the current
    # working directory; presumably it is the codebook -- confirm.
    with open('domain.json') as domain_file:
        domain_info = json.load(domain_file)

    # Check consistency of the codebook information: drop any column whose
    # last listed value is smaller than the maxval declared in the specs.
    # Iterate over a snapshot of the keys because entries are deleted.
    for col in list(domain_info):
        if domain_info[col][-1] < self.specs[col]['maxval']:
            print('Codebook inconsistent for', col)
            del domain_info[col]

    # Look at the ground-truth data to obtain the possible values for the
    # state-dependent columns.
    df = pd.read_csv(dataset)
    for col in ['SEA', 'METAREA', 'COUNTY', 'CITY', 'METAREAD']:
        domain_info[col] = sorted(df[col].unique())
    # Done using ground-truth data.

    # Domain size per column: number of listed values when the column has
    # codebook/ground-truth info, otherwise maxval + 1.
    domain = {}
    for col in self.specs:
        if col in domain_info:
            domain[col] = len(domain_info[col])
        else:
            domain[col] = self.specs[col]['maxval'] + 1

    # INCWAGE is replaced in the domain by the two columns INCWAGE_A and
    # INCWAGE_B (the former direct entry was domain['INCWAGE'] = 5002).
    domain['INCWAGE_A'] = 52
    domain['INCWAGE_B'] = 8
    del domain['INCWAGE']
    # VALUEH gets a fixed size regardless of what was computed above.
    domain['VALUEH'] = 5003

    self.domain_info = domain_info
    self.domain = Domain.fromdict(domain)
def reverse_data(data, supports):
    """Map compressed column codes back onto the full support index range.

    For every column in ``data.domain``:
      * values equal to the number of supported entries (the "overflow"
        code) are replaced by a random draw from the unsupported indices,
        when any exist;
      * all other values ``v`` are replaced by the index of the ``v``-th
        supported entry.

    Args:
        data: Object exposing ``.df`` (a DataFrame) and ``.domain``
            (iterable of column names).
        supports: Mapping from column name to a boolean array.
            # assumes 1-D boolean NumPy arrays -- TODO confirm

    Returns:
        A new ``Dataset`` over the expanded frame, whose domain gives each
        column the full size of its support array.
    """
    frame = data.df.copy()
    full_sizes = {}
    for col in data.domain:
        support = supports[col]
        overflow_code = support.sum()
        full_sizes[col] = int(support.size)

        supported = np.where(support)[0]
        unsupported = np.where(~support)[0]

        is_overflow = frame[col] == overflow_code
        if unsupported.size != 0:
            # Spread overflow rows uniformly over the unsupported indices.
            frame.loc[is_overflow, col] = np.random.choice(
                unsupported, is_overflow.sum()
            )
        # Remaining rows hold a rank into the supported indices.
        frame.loc[~is_overflow, col] = supported[frame.loc[~is_overflow, col]]

    return Dataset(frame, Domain.fromdict(full_sizes))
def postprocess(self):
    """Fit the inference engine to the measurements and sample synthetic data.

    Builds a ``FactoredInference`` engine over ``self.domain``, estimates
    a model from ``self.measurements``, draws synthetic data from it and
    passes the result through ``reverse_data`` with ``self.supports``.
    Stores the engine in ``self.engine`` and the result in
    ``self.synthetic``.
    """
    # NOTE(review): self.iters is read here but never used; the engine
    # below is built with a fixed iters=10000 -- confirm which is intended.
    iters = self.iters

    inference_domain = Domain.fromdict(self.domain)
    self.engine = FactoredInference(
        inference_domain,
        structural_zeros=None,
        iters=10000,
        log=True,
        warm_start=False,
        elim_order=self.elimination_order,
    )
    self.engine.estimate(self.measurements)

    # Keep the raw sample on self first so it is still available there if
    # the reverse mapping fails.
    self.synthetic = self.engine.model.synthetic_data()
    self.synthetic = reverse_data(self.synthetic, self.supports)
def transform_data(data, supports):
    """Compress each column onto the entries marked in its support array.

    For every column in ``data.domain``, the i-th supported index is
    recoded to ``i`` and every unsupported index is recoded to a single
    shared code equal to the number of supported entries.

    Args:
        data: Object exposing ``.df`` (a DataFrame) and ``.domain``
            (iterable of column names).
        supports: Mapping from column name to a boolean array.
            # assumes 1-D boolean NumPy arrays -- TODO confirm

    Returns:
        A new ``Dataset`` over the recoded frame. Each column's domain
        size is the supported count, plus one when at least one index is
        unsupported.
    """
    frame = data.df.copy()
    compact_sizes = {}
    for col in data.domain:
        support = supports[col]
        n_supported = support.sum()
        has_unsupported = n_supported < support.size
        compact_sizes[col] = int(n_supported) + 1 if has_unsupported else int(n_supported)

        recode = {}
        next_code = 0
        for position in range(support.size):
            if support[position]:
                recode[position] = next_code
                next_code += 1
            else:
                # All unsupported indices share one overflow code.
                recode[position] = n_supported
        assert next_code == n_supported

        frame[col] = frame[col].map(recode)

    return Dataset(frame, Domain.fromdict(compact_sizes))