def copyParentSyn(self, parent, keep_debug_info, zerosyn=False):
    """
    Set the node's syn (optimized) histograms to those of the parent.
    Used when this node is the parent's only child. When the parent is empty
    (zerosyn=True), set the histograms to zeros instead.
    :param parent: the parent GeounitNode whose histograms are copied
    :param keep_debug_info: if True, keep dp_queries and the unrounded histogram
    :param zerosyn: if True, set histograms to zeros rather than copying
    :return: self
    """
    if not zerosyn:
        self.syn = parent.syn
        self.unit_syn = parent.unit_syn
    else:  # Set to zeros if parent is zero
        self.syn = multiSparse(np.zeros(parent.syn.shape, dtype=int))
        self.unit_syn = multiSparse(np.zeros(parent.unit_syn.shape, dtype=int))

    if not keep_debug_info:
        # dp_queries (noisy measurements) take a lot of space / memory
        if self.dp_queries:
            self.dp_queries.clear()
        if self.unit_dp_queries:
            self.unit_dp_queries.clear()
    else:  # Keep dp_queries (noisy measurements) and unrounded results
        self.syn_unrounded = parent.syn_unrounded
    return self
def getToyGeounitData_GeounitNode(schema,
                                  geocodes=['000', '001', '002', '003', '010', '011', '012', '020', '022'],
                                  geocode_dict={3: 'block', 2: 'county'},
                                  raw_params={'low': 0, 'high': 2},
                                  syn_params={'low': 0, 'high': 5}):
    geounits = []
    for geocode in du.aslist(geocodes):
        # Guard against NameError: raw/syn stay None when the corresponding
        # params argument is not supplied
        raw = None
        syn = None
        if raw_params is not None:
            raw = np.random.randint(low=raw_params['low'], high=raw_params['high'],
                                    size=schema.size).reshape(schema.shape)
        if syn_params is not None:
            syn = np.random.randint(low=syn_params['low'], high=syn_params['high'],
                                    size=schema.size).reshape(schema.shape)
        geounits.append(
            GeounitNode(geocode=geocode, geocode_dict=geocode_dict,
                        raw=multiSparse(raw) if raw is not None else None,
                        syn=multiSparse(syn) if syn is not None else None))
    return geounits
def testdata_random_geounit_generator(geocode, schema, density=0.01, scale=10): raw_mat = np.round( ss.random(1, schema.size, format='csr', density=density) * scale) syn_mat = np.round( ss.random(1, schema.size, format='csr', density=density) * scale) raw_sparse = sp.multiSparse(raw_mat, schema.shape) syn_sparse = sp.multiSparse(syn_mat, schema.shape) return {'geocode': geocode, 'raw': raw_sparse, 'syn': syn_sparse}
def test_equal_float(datafloat):
    assert sparse.multiSparse(datafloat) == sparse.multiSparse(datafloat + 1e-6)
    shape = (6, 4, 5)
    size = np.prod(shape)
    # np.float is deprecated (removed in NumPy 1.24+); use the builtin float dtype
    a = np.arange(1, size + 1).reshape(shape).astype(float)
    a[0, 0, 0] = np.nan
    b = a + 1e-6
    assert sparse.multiSparse(a) == sparse.multiSparse(b)
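# A minimal sketch (plain NumPy, not project code) of the tolerance- and
# NaN-aware equality the test above exercises; multiSparse.__eq__ is assumed
# to behave like a dense np.allclose with equal_nan=True for float data.
import numpy as np

def dense_float_equal(a, b, tol=1e-5):
    # equal_nan=True makes co-located NaNs compare equal, as in the test above
    return a.shape == b.shape and np.allclose(a, b, atol=tol, equal_nan=True)

arr = np.arange(1, 121, dtype=float).reshape((6, 4, 5))
arr[0, 0, 0] = np.nan
assert dense_float_equal(arr, arr + 1e-6)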
def n():
    geocode_dict = {16: 'Block', 12: 'Block_Group', 11: 'Tract', 5: 'County'}
    histogram = sparse.multiSparse(
        np.array([[[[5, 0], [0, 4]], [[5, 0], [0, 4]]],
                  [[[5, 0], [0, 4]], [[5, 0], [0, 4]]]]))
    housing_hist = sparse.multiSparse(np.array([0, 1, 1, 0, 0, 0, 7, 2]))
    return nodes.GeounitNode(geocode='123456789abcdefg', geocode_dict=geocode_dict,
                             raw=histogram, raw_housing=housing_hist)
def test_init():
    good_array = np.array([[0, 1, 2], [2, 0, 4]])
    bad_obj = {"a": "bad", "dict": "obj"}
    spar = multiSparse(good_array)
    assert spar.shape == (2, 3)
    assert isinstance(spar.sparse_array, ss.csr_matrix)
    assert spar.sparse_array.count_nonzero() == 4
    # Constructing from a non-ndarray object should raise TypeError
    try:
        multiSparse(bad_obj)
        assert False, "multiSparse should reject non-ndarray input"
    except TypeError:
        pass
def makeBlockNode(self, person_unit_arrays):
    """
    This function makes a block node from person unit arrays for a given geocode.

    Inputs:
        person_unit_arrays: a (geocode, arrays) element of an RDD, where arrays
                            are the tables defined in the config

    Output:
        block_node: a nodes.GeounitNode object for the given geocode
    """
    geocode, arrays = person_unit_arrays
    # Assign arrays to table names in a dictionary {name: array}, filling in with
    # zeros if an array is non-existent
    assert len(arrays) == len(self.data_names)
    data_dict = {
        n: a if a is not None else np.zeros(self.shape_dict[n]).astype(int)
        # TODO: Wonder if this creation of zeros takes too much time, maybe directly in multisparse?
        for n, a in zip(self.data_names, arrays)
    }

    # geocode is a tuple where the [1] entry is empty. We only want the [0] entry.
    geocode = geocode[0]
    logging.info(f"creating geocode: {geocode}")
    raw = sparse.multiSparse(data_dict[self.privacy_table_name],
                             shape=self.shape_dict[self.privacy_table_name])
    raw_housing = sparse.multiSparse(data_dict[self.constraint_table_name],
                                     shape=self.shape_dict[self.constraint_table_name])

    # Make Invariants
    invariants_dict = self.setup.makeInvariants(raw=raw, raw_housing=raw_housing,
                                                invariant_names=self.invar_names)
    # Make Constraints
    constraints_dict = self.setup.makeConstraints(
        hist_shape=(self.setup.hist_shape, self.setup.unit_hist_shape),
        invariants=invariants_dict, constraint_names=self.cons_names)

    block_node = nodes.GeounitNode(geocode=geocode,
                                   geocode_dict=self.modified_geocode_dict,
                                   raw=raw, raw_housing=raw_housing,
                                   cons=constraints_dict, invar=invariants_dict)
    return block_node
def test_toDense(): good_array = np.array([[[0, 1], [2, 0], [0, 3]], [[4, 0], [0, 5], [6, 0]]]) spar = multiSparse(good_array) assert spar.shape == (2, 3, 2) undo = spar.toDense() assert (undo == good_array).all()
def data(spark): geocodeDict = {16: 'Block', 12: 'Block_Group', 11: 'Tract', 5: 'County'} bn1 = geounitNode(geocode='4400700010111000', raw=multiSparse(np.array([[1, 2], [3, 4]])), syn=multiSparse(np.array([[1, 1], [0, 7]])), geocodeDict=geocodeDict) bn2 = geounitNode(geocode='4400700010111001', raw=multiSparse(np.array([[3, 4], [2, 1]])), syn=multiSparse(np.array([[2, 2], [1, 0]])), geocodeDict=geocodeDict) sc = spark.sparkContext return sc.parallelize([bn1, bn2])
def minSchematize(node, array_dims, add_over_margins):
    minSchemaQuery = cenquery.Query(array_dims=array_dims, subset=None,
                                    add_over_margins=add_over_margins)
    node.raw = sparse.multiSparse(minSchemaQuery.answer_original(node.raw.toDense()))
    minSchema_shape = node.raw.shape
    # Sort so the kept dimensions come out in a deterministic order
    dims_keep = sorted(set(range(len(array_dims))).difference(set(add_over_margins)))
    constraint_keys = node.cons.keys()
    for key in constraint_keys:
        node.cons[key].query.array_dims = minSchema_shape
        node.cons[key].query.add_over_margins = tuple(
            sorted(set(dims_keep).intersection(set(node.cons[key].query.add_over_margins))))
        node.cons[key].query.subset_input = [node.cons[key].query.subset_input[x]
                                             for x in dims_keep]
        node.cons[key].query.subset = np.ix_(*tuple(node.cons[key].query.subset_input))
        # axis_groupings are currently not used in constraints
        print(node.cons[key].query)
        node.cons[key].check_after_update()
    return node
def test_add2():
    good_array_A = np.array([[0, 1], [2, 3]])
    good_array_B = np.array([[3, 2], [1, 0]])
    bad_array_C = np.array([[2, 4, 4]])
    spar_A = multiSparse(good_array_A)
    spar_B = multiSparse(good_array_B)
    spar_C = multiSparse(bad_array_C)
    spar = spar_A + spar_B
    assert spar.shape == (2, 2)
    assert spar.sparse_array.count_nonzero() == 4
    assert (spar.toDense() != spar_A.toDense()).any()
    assert (spar.toDense() == np.array([[3, 3], [3, 3]])).all()
    # Adding arrays of mismatched shapes should fail. (The original test added
    # spar_A + spar_B here and expected AssertionError, so it always passed;
    # use the mismatched spar_C and a flag so the failure mode is unambiguous.)
    raised = False
    try:
        spar_A + spar_C
    except AssertionError:
        raised = True
    assert raised, "adding mismatched shapes should raise AssertionError"
def Data(self):
    """
    Data in the shape of histograms for 1 Block. Hist shape is (2,),
    e.g. (Male, Female).
    """
    b1 = multiSparse(np.array([1, 2]))
    block_nodes = [
        GeounitNode('b1', raw=b1, raw_housing=b1, invar={}, cons={},
                    geocode_dict={2: "Block"}),
    ]
    return block_nodes
def agg_func(config, parent_child_node):
    """
    This function takes a set of parent and child nodes, aggregates the
    children's syn histograms, and replaces parent.syn with the aggregate.

    Inputs:
        config: the config object
        parent_child_node: a list of a parent and its children nodes

    Outputs:
        parent: the parent node
    """
    parent_child_node = list(parent_child_node)
    parent_geocode = parent_child_node[0]
    # a list of the node objects
    nodes = list(list(parent_child_node)[1])
    # calculate the length of each of the geocodes (to determine which is the parent)
    geocode_lens = [len(node.geocode) for node in nodes]
    # the parent is the shortest geocode
    parent = nodes[np.argmin(geocode_lens)]
    # subset the children nodes
    children = nodes[:np.argmin(geocode_lens)] + nodes[np.argmin(geocode_lens) + 1:]
    children = sorted(children, key=lambda geocode_data: int(geocode_data.geocode))
    child_geos = [child.geocode for child in children]
    parent.backup_solve = children[0].parent_backup_solve
    syn_agg = sparse.multiSparse(np.zeros(parent.syn.shape))
    for child in children:
        syn_agg = syn_agg + child.syn
    parent.syn = syn_agg
    return parent
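# A minimal sketch (plain NumPy, not project code) of the aggregation agg_func
# performs: multiSparse.__add__ is assumed to behave like elementwise dense
# addition on same-shape histograms.
import numpy as np

children_hists = [np.array([[1, 1], [0, 7]]), np.array([[2, 2], [1, 0]])]
parent_hist = np.zeros_like(children_hists[0])
for child_hist in children_hists:
    parent_hist = parent_hist + child_hist  # mirrors syn_agg = syn_agg + child.syn
assert (parent_hist == np.array([[3, 3], [1, 7]])).all()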
def sample_histogram(node: GeounitNode, sample_target: int):
    """
    :param node: The input GeounitNode which will receive a new sampled histogram
    :param sample_target: The size of the target sample population
    :return: The input node with its syn attribute set to the sampled histogram
    """
    assert all([
        node.raw is not None,
        isinstance(node.raw, multiSparse),
        node.raw.sparse_array is not None,
        node.raw.sparse_array.data is not None
    ])

    # Record the multidimensional shape of the histogram; the sparse data itself
    # is stored as a 1D CSR array
    data_shape = node.raw.shape

    # Get the shape and indices of populated values in the sparse matrix to be
    # able to recreate a new one with the same sparsity pattern
    csr_shape = node.raw.sparse_array.shape
    indices = node.raw.sparse_array.indices
    indptr = node.raw.sparse_array.indptr

    # Get the probability vector
    pval = BootstrapEngine.compute_pval(node)

    # Sample from a multinomial with the pval probabilities
    sampled_data = numpy.random.multinomial(sample_target, pval)

    # Produce the new CSR matrix and histogram
    new_matrix = ss.csr_matrix((sampled_data, indices, indptr), shape=csr_shape)
    new_histogram: __HistData__ = multiSparse(new_matrix, shape=data_shape)

    # Set the node's syn attribute
    node.syn = new_histogram
    return node
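# A hedged sketch (SciPy/NumPy only) of the resampling pattern used by
# sample_histogram above, without GeounitNode/multiSparse/BootstrapEngine.
# compute_pval is assumed to normalize the nonzero counts to probabilities.
import numpy as np
import scipy.sparse as ss

rng = np.random.default_rng(0)
raw = ss.csr_matrix(np.array([[0, 5, 0, 15, 0, 30]]))
pval = raw.data / raw.data.sum()                # probabilities over populated cells
sampled = rng.multinomial(100, pval)            # resample to a target population
new_hist = ss.csr_matrix((sampled, raw.indices, raw.indptr), shape=raw.shape)
assert new_hist.sum() == 100                    # total equals the sample target
assert (new_hist.indices == raw.indices).all()  # sparsity pattern preserved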
def mfUrData(self, setup_instance):
    """
    Data in the shape of histograms for 3 Rural blocks in 1 Rural county and
    3 Urban blocks in 1 Urban county, all in 1 state.
    Histogram shape is (2,) for sex, i.e. each block provides the number of
    males and the number of females.
    This is the same test example as in the JavaScript simulator.
    """
    rb1 = multiSparse(np.array([1, 2]))
    rb2 = multiSparse(np.array([3, 4]))
    rb3 = multiSparse(np.array([5, 6]))
    ub1 = multiSparse(np.array([101, 102]))
    ub2 = multiSparse(np.array([103, 104]))
    ub3 = multiSparse(np.array([105, 106]))
    block_nodes = []
    for block, geocode in zip([rb1, rb2, rb3, ub1, ub2, ub3],
                              ['1RB1', '1RB2', '1RB3', '1UB1', '1UB2', '1UB3']):
        invariants = setup_instance.makeInvariants(
            raw=block, raw_housing=block,
            invariant_names=setup_instance.inv_con_by_level['Block']['invar_names'])
        constraints = setup_instance.makeConstraints(
            hist_shape=(2,), invariants=invariants,
            constraint_names=setup_instance.inv_con_by_level['Block']['cons_names'])
        block_nodes.append(
            GeounitNode(geocode, raw=block, raw_housing=block,
                        invar=invariants, cons=constraints,
                        geocode_dict={4: "Block", 3: "County", 1: "State"}))
    return block_nodes
def test_makeAdditionalInvariantsConstraints(self, block_cons, state_cons, county_cons): class TestSetup(DASDecennialSetup): def __init__(self): self.hist_shape = (2, ) self.hist_vars = ("sex", ) self.validate_input_data_constraints = False self.inv_con_by_level = { 'Block': { 'invar_names': ('tot', ) if block_cons else (), 'cons_names': ('total', ) if block_cons else (), }, 'County': { 'invar_names': ('tot', ) if county_cons else (), 'cons_names': ('total', ) if county_cons else (), }, 'State': { 'invar_names': ('tot', ) if state_cons else (), 'cons_names': ('total', ) if state_cons else () } } @staticmethod def makeInvariants(raw, raw_housing, invariant_names): inv_dict = {} if 'tot' in invariant_names: inv_dict.update({'tot': np.sum(raw.toDense())}) return inv_dict @staticmethod def makeConstraints(hist_shape, invariants, constraint_names): cons_dict = {} if 'total' in constraint_names: cons_dict.update({ 'total': Constraint( MultiHistQuery((QueryFactory.makeTabularGroupQuery( (2, ), add_over_margins=(0, )), StubQuery( (2, 1), "stub")), (1, 0)), np.array(invariants['tot']), "=", "total") }) return cons_dict setup_instance = TestSetup() rb1 = sparse.multiSparse(np.array([1, 2])) rb2 = sparse.multiSparse(np.array([3, 4])) rb3 = sparse.multiSparse(np.array([5, 6])) ub1 = sparse.multiSparse(np.array([101, 102])) ub2 = sparse.multiSparse(np.array([103, 104])) ub3 = sparse.multiSparse(np.array([105, 106])) block_nodes = [] for block, geocode in zip( [rb1, rb2, rb3, ub1, ub2, ub3], ['1RB1', '1RB2', '1RB3', '1UB1', '1UB2', '1UB3']): invariants = setup_instance.makeInvariants( raw=block, raw_housing=block, invariant_names=setup_instance.inv_con_by_level['Block'] ['invar_names']) constraints = setup_instance.makeConstraints( hist_shape=(2, ), invariants=invariants, constraint_names=setup_instance.inv_con_by_level['Block'] ['cons_names']) block_nodes.append( GeounitNode(geocode, raw=block, raw_housing=block, invar=invariants, cons=constraints, geocode_dict={ 4: "Block", 3: "County", 1: "State" })) rc = block_nodes[0].addInReduce(block_nodes[1]).addInReduce( block_nodes[2]).shiftGeocodesUp() rc.makeAdditionalInvariantsConstraints(setup_instance) uc = block_nodes[3].addInReduce(block_nodes[4]).addInReduce( block_nodes[5]).shiftGeocodesUp() uc.makeAdditionalInvariantsConstraints(setup_instance) state = rc.addInReduce(uc).shiftGeocodesUp() state.makeAdditionalInvariantsConstraints(setup_instance) assert state.checkConstraints() assert rc.checkConstraints() assert uc.checkConstraints()
def geoimp_wrapper_nat(*, config, parent_shape, nat_node: GeounitNode, min_schema=None):
    """
    This function performs the Post-Processing Step of National to National level.
    It is called from engine_utils.py:topdown in a Spark map operation.

    Inputs:
        config: configuration object
        nat_node: a GeounitNode object referring to the entire nation

    Output:
        nat_node: a GeounitNode object referring to the entire nation
    """
    # Make sure that the logger is set up on all of the nodes
    clogging.setup(level=logging.INFO, syslog=True,
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))
    # t_start = time.time()
    parent_hist = None

    noisy_child = np.expand_dims(asDense(nat_node.dp.DPanswer),
                                 axis=len(nat_node.dp.DPanswer.shape)) if nat_node.dp else None
    noisy_child_weight = 1. / nat_node.dp.Var if nat_node.dp else None
    parent_geocode = "nat_to_nat"

    # TODO: Maybe filtering out the detailed query from node.dp_queries can be done neater
    dp_queries_comb = stackNodeProperties([nat_node, ], lambda node: node.dp_queries,
                                          cons_dpq.StackedDPquery, lambda name: name != C.DETAILED)
    query_weights = map(lambda sdpq: 1. / sdpq.Var, dp_queries_comb)  # We can get actual variance for each query if we want
    constraints_comb = stackNodeProperties([nat_node, ], lambda node: node.cons,
                                           cons_dpq.StackedConstraint)

    # Create an L2PlusRounderWithBackup object
    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None, parent=parent_hist, parent_shape=parent_shape, NoisyChild=noisy_child,
        childGeoLen=1, config=config, DPqueries=dp_queries_comb, constraints=constraints_comb,
        NoisyChild_weight=noisy_child_weight, query_weights=query_weights,
        identifier="nat_to_nat", min_schema=min_schema, stat_node=nat_node)

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # get rid of extra dimension
    int_answer = int_answer.squeeze()
    l2_answer = l2_answer.squeeze()

    # Check constraints on the dense answer before converting to sparse
    nat_node.syn = int_answer
    constraintsCheck(nat_node, parent_geocode)

    nat_node.syn = sparse.multiSparse(int_answer)
    nat_node.syn_unrounded = sparse.multiSparse(l2_answer)
    return nat_node
def conform2PL94(node: GeounitNode):
    DP_counts = node.getDenseSyn()
    PL94_counts = node.invar['pl94counts']
    # Cap each DP cell at its PL94 invariant count (an elementwise minimum)
    node.syn = multiSparse(np.where(DP_counts > PL94_counts, PL94_counts, DP_counts))
    return node
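# A minimal sketch: the np.where expression in conform2PL94 above caps each DP
# cell at its PL94 invariant; on same-shape arrays it is equivalent to np.minimum.
import numpy as np

DP = np.array([4, 0, 9, 2])
PL94 = np.array([3, 1, 9, 5])
assert (np.where(DP > PL94, PL94, DP) == np.minimum(DP, PL94)).all()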
def test_square(dataint, datafloat): assert sparse.multiSparse(dataint).square() == sparse.multiSparse(np.square(dataint)) assert sparse.multiSparse(datafloat).square() == sparse.multiSparse(np.square(datafloat))
def test_sum(dataint, datafloat):
    assert sparse.multiSparse(dataint).sum() == np.sum(dataint)
    assert np.isclose(sparse.multiSparse(datafloat).sum(), np.sum(datafloat))
    assert np.array_equal(sparse.multiSparse(dataint).sum(dims=(1, 2)), dataint.sum((1, 2)))
    assert np.isclose(sparse.multiSparse(datafloat).sum(dims=(1, 2)), datafloat.sum((1, 2))).all()
def makeInputsAndRunOptimizer(children, config, min_schema, parent_hist, parent_shape,
                              parent_geocode, optimizers, keep_debug_info=False, aian=False):
    """
    Converts the data from nodes to the inputs taken by optimizer: multiarrays,
    StackedConstraints, StackedDPQueries etc., creates the optimizer, runs it, and
    puts the optimized answers back into the nodes

    This is called from:
        * geoimp_wrapper_root().
        * geoimp_wrapper()

    :param optimizers:
    :param children: iterable (list or multiarray) of children noisy histograms
        (i.e. detailed query measurements, aka noisy counts)
    :param config: DAS config file
    :param min_schema: backup feasibility schema (reduced schema through which constraints can be expressed)
    :param parent_hist: optimized histogram of the parent node
    :param parent_shape: shape of the parent histogram (children have the same shape too)
    :param parent_geocode: parent geocode
    :param keep_debug_info: whether to delete DPqueries after optimization (they take a lot of space)
        and not include unrounded optimized data into the node
    :return: list of optimized children nodes and accumulator count of backup feasibility triggers
    """
    if config.getboolean(section=CC.ENGINE, option="reset_dpq_weights", fallback=False):
        variances = []
        for child in children:
            variances.extend(child.getAllVariances())
        min_var = min(variances)
        children = [child.setDPQVar(func=lambda v: v / min_var) for child in children]

    # This is to make sure that the total constraint is not accidentally left on for
    # AIAN and non-AIAN, but really should be taken care of in config:
    # the total US population has to be set up as invariant, and State turned off.
    # if aian:
    #     for child in children:
    #         child.removeConstraintByName('total')

    child_groups = makeChildGroups(children) if aian else None

    # Get the stacked detailed dp_queries (if we've taken detailed measurements), as well as
    # their weights. If only one child, just expand.
    noisy_child = np.stack([child.stackDetailedDPAnswers(parent_shape) for child in children],
                           axis=-1) if children[0].dp else None
    noisy_child_weights = [child.detailedWeight() for child in children]

    constraints_comb = stackNodeProperties(children, lambda node: node.cons,
                                           cons_dpq.StackedConstraint)
    dp_queries_comb = []
    # A loop over histograms. Each iteration goes over children (stackNodeProperties does that)
    # and gets the dp_queries dict corresponding to that histogram and stacks them
    for i in range(len(parent_shape)):
        dp_queries_comb.append(
            stackNodeProperties(children, lambda node: node.querySets2Stack()[i],
                                cons_dpq.StackedDPquery))

    # TODO: Note that multipass rounder queries only support the main histogram currently (hence no loop below).
# May be necessary for full-scale DHCH to expand this to support the full histogram rounder_queries_comb = [ stackNodeProperties(children, lambda node: node.rounder_queries, cons_dpq.StackedQuery) ] opt_dict = { "Cons": stackNodeProperties(children, lambda node: node.opt_dict["Cons"], cons_dpq.StackedConstraint), "npass_info": children[0].opt_dict["npass_info"], } if children[0].opt_dict is not None else None sequential_optimizers_dict = { CC.L2_PLUS_ROUNDER_WITH_BACKUP: sequential_optimizers.L2PlusRounderWithBackup, CC.L2_PLUS_ROUNDER_WITH_BACKUP_INTERLEAVED: sequential_optimizers.L2PlusRounderWithBackup_interleaved, } seq_opt_name, l2_opt, rounder = optimizers seq_opt_cls = sequential_optimizers_dict[seq_opt_name] try: l2c2o = children[0].query_ordering[CC.L2_CONSTRAIN_TO_QUERY_ORDERING] except KeyError: l2c2o = None # Create an appropriate sequential optimizer object seq_opt = seq_opt_cls( identifier=parent_geocode, child_geolevel=children[0].geolevel, parent=parent_hist, parent_shape=parent_shape, childGeoLen=len(children), constraints=constraints_comb, NoisyChild=noisy_child, noisy_child_weights=noisy_child_weights, DPqueries=dp_queries_comb, rounder_queries=rounder_queries_comb, min_schema=(min_schema, False), child_groups=child_groups, opt_dict=opt_dict, L2_DPqueryOrdering=children[0].query_ordering[CC.L2_QUERY_ORDERING], L2_Constrain_to_Ordering=l2c2o, Rounder_DPqueryOrdering=children[0].query_ordering[ CC.ROUNDER_QUERY_ORDERING], optimizers=(l2_opt, rounder), das=None, config=config) l2_answer, int_answer, backup_solve_status = seq_opt.run() # Slice off the combined child solution to make lists of ndarrays, with one element for each child int_answer_list = np_utils.sliceArray(int_answer[0]) unit_int_answer_list = np_utils.sliceArray(int_answer[1]) l2_answer_list = np_utils.sliceArray(l2_answer[0]) for i, child in enumerate(children): child.syn = int_answer_list[i] child.unit_syn = unit_int_answer_list[i] constraintsCheck(children) # Convert to sparse arrays for efficiency for i, child in enumerate(children): child.syn = sparse.multiSparse(int_answer_list[i]) child.unit_syn = sparse.multiSparse(unit_int_answer_list[i]) if keep_debug_info: child.syn_unrounded = sparse.multiSparse(l2_answer_list[i]) else: child.dp_queries.clear() return children, backup_solve_status
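# A minimal sketch (plain NumPy) of the stack-then-slice pattern used above.
# Child histograms are stacked along a new trailing "geography" axis before the
# joint solve, and the combined answer is sliced back into per-child arrays
# (np_utils.sliceArray is assumed to behave like the list comprehension below).
import numpy as np

child_hists = [np.arange(6).reshape(2, 3) for _ in range(4)]
stacked = np.stack(child_hists, axis=-1)       # shape (2, 3, 4): last axis = child
assert stacked.shape == (2, 3, 4)
per_child = [stacked[..., i] for i in range(stacked.shape[-1])]
assert all((a == b).all() for a, b in zip(per_child, child_hists))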
def geoimp_wrapper_nat(config, nat_node):
    """
    This function performs the Post-Processing Step of National to National level.

    Inputs:
        config: configuration object
        nat_node: a geounitNode object referring to the entire nation

    Output:
        nat_node: a geounitNode object referring to the entire nation
    """
    import programs.engine.geoimpgbopt as geoimpgbopt
    parent_hist = None
    NoisyChild = np.expand_dims(nat_node.dp.DPanswer, axis=len(nat_node.dp.DPanswer.shape))
    NoisyChild_weight = 1 / nat_node.dp.Var
    child_geos = nat_node.geocode
    parent_geocode = "nat_to_nat"
    DPqueries = nat_node.dp_queries.values()
    constraints = nat_node.cons.values() if nat_node.cons else None
    query_weights = []

    # need to add a dimension for geography to the object
    for x in DPqueries:
        x.query.array_dims = NoisyChild.shape
        x.query.subset_input = tuple(list(x.query.subset_input) + [[0]])
        x.query.subset = np.ix_(*x.query.subset_input)
        x.DPanswer = np.expand_dims(x.DPanswer, axis=len(x.DPanswer.shape))
        x.check_after_update()
        weight = 1 / x.Var
        query_weights.append(weight)

    # if no DPqueries, change this to an empty list
    if not DPqueries:
        DPqueries = []
        query_weights = None

    # need to add a dimension for geography to the object
    if constraints is not None:
        for x in constraints:
            x.query.array_dims = NoisyChild.shape
            x.query.subset_input = tuple(list(x.query.subset_input) + [[0]])
            x.query.subset = np.ix_(*x.query.subset_input)
            x.rhs = np.expand_dims(x.rhs, axis=len(x.rhs.shape))
            x.check_after_update()

    # this is the actual post-processing optimization step
    l2_answer, int_answer, backup_solve_status = geoimpgbopt.L2geoimp_wrapper(
        config=config, parent=parent_hist, NoisyChild=NoisyChild,
        NoisyChild_weight=NoisyChild_weight, DPqueries=DPqueries,
        query_weights=query_weights, constraints=constraints, identifier="nat_to_nat")

    if constraints is not None:
        check = True
        for x in constraints:
            check = bool(np.prod(x.check(int_answer)) * check)
        print("constraints are ", check, "for parent geocode ", parent_geocode)

    # get rid of the extra dimension
    nat_node.syn = sparse.multiSparse(int_answer.squeeze())
    nat_node.syn_unrounded = sparse.multiSparse(l2_answer.squeeze())
    return nat_node
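# A minimal sketch: the "add a dimension for geography" step above appends a
# trailing axis of length 1, which is equivalent to indexing with None.
import numpy as np

a = np.arange(6).reshape(2, 3)
assert np.expand_dims(a, axis=a.ndim).shape == (2, 3, 1)
assert (np.expand_dims(a, axis=a.ndim) == a[..., None]).all()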
def make_block_node(self, person_unit_arrays):
    """
    This function makes block nodes from person unit arrays for a given geocode.

    args:
        person_unit_arrays - a key, value pair of (geocode, arrays), where arrays
                             are the histograms defined in the config

    returns:
        block_node - a nodes.geounitNode object for the given geocode
    """
    geocode, arrays = person_unit_arrays
    arrays = list(arrays)
    gqhhvacs = arrays[1].astype(int)
    arrays[1] = arrays[1][:-1]

    # Assign arrays to table names in a dictionary and fill in with zeros if array is non-existent
    assert len(arrays) == len(self.data_names)
    data_dict = {n: a.astype(int) if a is not None else np.zeros(self.person_hist_dimensions).astype(int)
                 for n, a in zip(self.data_names, arrays)}

    # geocode is a tuple where the [1] entry is empty. We only want the [0] entry.
    geocode = geocode[0]
    logging.info("creating geocode: %s" % geocode)

    housing_table_name = self.housing_table_name
    privacy_table_name = self.privacy_table_name
    raw = sparse.multiSparse(data_dict[privacy_table_name].astype(int))
    raw_housing = sparse.multiSparse(data_dict[housing_table_name].astype(int))

    levels = tuple(self.config["geodict"]["geolevel_names"].split(","))
    invar_names = tuple(self.config[CONSTRAINTS][THE_INVARIANTS + "." + levels[0]].split(","))
    if invar_names == ("",):
        invariants_dict = {}
    else:
        invariants_dict = self.InvariantsCreator(raw=raw, raw_housing=raw_housing,
                                                 invariant_names=invar_names).calculateInvariants().invariants_dict
    # Not used for constraints, but must be passed through; this way hhvacs doesn't
    # need to be added to the node signature anymore
    invariants_dict["gqhhvacs_vect"] = gqhhvacs

    cons_names = tuple(self.config[CONSTRAINTS][THE_CONSTRAINTS + "." + levels[0]].split(","))
    # Make Constraints
    if cons_names == ("",):
        constraints_dict = {}
    else:
        constraints_dict = self.ConstraintsCreator(hist_shape=data_dict[self.privacy_table_name].shape,
                                                   invariants=invariants_dict,
                                                   constraint_names=cons_names)\
            .calculateConstraints().constraints_dict

    block_node = nodes.geounitNode(geocode=geocode, geocodeDict=self.geocodeDict,
                                   raw=raw, raw_housing=raw_housing,
                                   cons=constraints_dict, invar=invariants_dict)
    return block_node
def make_block_node(config, person_unit_arrays, dim):
    """
    This function makes block nodes from person unit arrays for a given geocode.

    Inputs:
        config: a configuration object
        person_unit_arrays: a (geocode, arrays) element of an RDD, where arrays
                            are the tables defined in the config

    Output:
        block_node: a nodes.geounitNode object for the given geocode
    """
    # import invariants_module
    (file, invariants_class_name) = config[CONSTRAINTS][INVARIANTS].rsplit(".", 1)
    invariants_module = __import__(file, fromlist=[invariants_class_name])

    # import constraints_module
    (file, class_name) = config[CONSTRAINTS][CONSTRAINTS].rsplit(".", 1)
    constraints_module = __import__(file, fromlist=[class_name])

    # Get the names of tables in person_unit_arrays.
    data_names = [config[READER][PTABLE]] + config[READER][CTABLES].split(",")
    geocode, arrays = person_unit_arrays
    data_dict = {}
    for i in range(len(arrays)):
        data_dict[data_names[i]] = arrays[i].astype(int) if arrays[i] is not None else np.zeros(dim).astype(int)

    # geocode is a tuple where the [1] entry is empty. We only want the [0] entry.
    geocode = geocode[0]
    logging.info("creating geocode: %s" % geocode)

    # Make Invariants
    invar_names = tuple(config[CONSTRAINTS][THEINVARIANTS].split(","))
    invariants_dict = getattr(invariants_module, invariants_class_name)(
        data_dict=data_dict, invariant_names=invar_names).calculateInvariants().invariants_dict

    # Make Constraints
    privacy_table_name = config[READER][PTABLE]
    cons_names = tuple(config[CONSTRAINTS]["theConstraints"].split(","))
    hist_shape = data_dict[privacy_table_name].shape
    constraints_dict = getattr(constraints_module, class_name)(
        hist_shape=hist_shape, invariants=invariants_dict,
        constraint_names=cons_names).calculateConstraints().constraints_dict

    raw = data_dict[privacy_table_name].astype(int)
    block_node = nodes.geounitNode(geocode=geocode, config=config,
                                   raw=sparse.multiSparse(raw),
                                   cons=constraints_dict, invar=invariants_dict)
    return block_node
def test_abs(dataint, datafloat): assert sparse.multiSparse(dataint).abs() == sparse.multiSparse(np.abs(dataint)) assert sparse.multiSparse(datafloat).abs() == sparse.multiSparse(np.abs(datafloat))
def geoimp_wrapper(*, config, parent_child_node, accum, min_schema=None):
    """
    This function performs the Post-Processing Step for a generic parent to the Child geography.
    It is called from topdown_engine.py:topdown in a Spark map operation.
    It runs on the CORE and TASK nodes, not on the MASTER, so there is no das object!

    Inputs:
        config: configuration object
        parent_child_node: a (k, v) RDD element with key being a geocode and value being a tuple
                           of GeounitNode objects containing one parent and multiple children
        accum: spark accumulator object which tracks the number of solves that use the backup solve

    Output:
        children: a list of Node objects for each of the children, after post-processing
    """
    # Make sure that the logger is set up on all the nodes
    clogging.setup(level=logging.INFO, syslog=True,
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))

    parent: GeounitNode
    children: List[GeounitNode]
    parent, children = findParentChildNodes(parent_child_node)

    n_children = len(children)

    #######
    # Under certain circumstances we can skip the Gurobi optimization
    #######
    #
    # Only 1 child
    if n_children == 1:
        children[0].syn = parent.syn
        return children

    if parent.syn.sum() == 0:
        for child in children:
            child.syn = sparse.multiSparse(np.zeros(parent.syn.shape))
        return children

    #########
    # resume code for Gurobi optimization
    ########

    # stack the dp arrays on top of one another, if only 1 child just expand the axis
    if parent.dp:
        if n_children > 1:
            noisy_child = np.stack([asDense(child.dp.DPanswer) for child in children], axis=-1)
        else:
            noisy_child = np.expand_dims(asDense(children[0].dp.DPanswer),
                                         axis=len(children[0].dp.DPanswer.shape))
    else:
        noisy_child = None

    noisy_child_weight = 1. / children[0].dp.Var if parent.dp else None

    # TODO: Maybe filtering out the detailed query from node.dp_queries can be done neater
    dp_queries_comb = stackNodeProperties(children, lambda node: node.dp_queries,
                                          cons_dpq.StackedDPquery, lambda name: name != C.DETAILED)
    query_weights = map(lambda sdpq: 1. / sdpq.Var, dp_queries_comb)  # We can get actual variance for each query if we want
    constraints_comb = stackNodeProperties(children, lambda node: node.cons,
                                           cons_dpq.StackedConstraint)

    parent_hist = parent.getDenseSyn()
    parent_geocode = parent.geocode

    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None, config=config, parent=parent_hist, parent_shape=parent_hist.shape,
        NoisyChild=noisy_child, childGeoLen=n_children, DPqueries=dp_queries_comb,
        constraints=constraints_comb, NoisyChild_weight=noisy_child_weight,
        query_weights=query_weights, identifier=parent_geocode, min_schema=min_schema,
        stat_node=children[0])

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # slice off the combined child solution to make separate arrays for each child
    int_answer_list = np_utils.sliceArray(int_answer)
    l2_answer_list = np_utils.sliceArray(l2_answer)

    # check constraints on the dense answers
    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        constraintsCheck(child)

    # make sparse arrays
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])

    if backup_solve_status is True:
        accum += 1

    return children
def test_to_list_from_sparse(): spar_obj = multiSparse(np.array([[2, 3, 0], [0, 0, 1]])) assert to_list_from_sparse(spar_obj) == [((0, 0), 2), ((0, 1), 3), ((1, 2), 1)]
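# A hedged sketch of a plausible to_list_from_sparse implementation, using only
# SciPy/NumPy (the project helper may differ in details such as ordering).
import numpy as np
import scipy.sparse as ss

def to_coord_value_list(dense):
    # COO format exposes parallel (row, col, value) arrays for nonzero cells
    mat = ss.coo_matrix(dense)
    return sorted(((int(r), int(c)), int(v))
                  for r, c, v in zip(mat.row, mat.col, mat.data))

assert to_coord_value_list(np.array([[2, 3, 0], [0, 0, 1]])) == \
    [((0, 0), 2), ((0, 1), 3), ((1, 2), 1)]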
def geoimp_wrapper(config, parent_child_node, accum):
    """
    This function performs the Post-Processing Step for a generic parent to the Child geography.

    Inputs:
        config: configuration object
        parent_child_node: a collection of geounitNode objects containing one parent and multiple children
        accum: spark accumulator object

    Output:
        children: a collection of geounitNode objects for each of the children, after post-processing
    """
    import programs.engine.geoimpgbopt as geoimpgbopt
    from itertools import compress

    parent_child_node = list(parent_child_node)
    parent_geocode = parent_child_node[0]
    print("parent geocode is", parent_geocode)
    # a list of the node objects
    nodes = list(list(parent_child_node)[1])

    # calculate the length of each of the geocodes (to determine which is the parent)
    geocode_lens = [len(node.geocode) for node in nodes]
    # the parent is the shortest geocode
    parent = nodes[np.argmin(geocode_lens)]
    # subset the children nodes
    children = nodes[:np.argmin(geocode_lens)] + nodes[np.argmin(geocode_lens) + 1:]
    children = sorted(children, key=lambda geocode_data: int(geocode_data.geocode))
    child_geos = [child.geocode for child in children]
    n_children = len(child_geos)

    # stack the dp arrays on top of one another, if only 1 child just expand the axis
    if n_children > 1:
        NoisyChild = np.stack([child.dp.DPanswer for child in children], axis=-1)
    else:
        NoisyChild = np.expand_dims(children[0].dp.DPanswer,
                                    axis=len(children[0].dp.DPanswer.shape))

    # combine DPqueries without geography into combined DPqueries with geography;
    # if there are no DPqueries, use an empty list
    if not children[0].dp_queries:
        DPqueries_comb = []
    else:
        DPqueries = list(list(child.dp_queries.values()) for child in children)
        n_q = len(DPqueries[0])
        DPqueries_comb = []
        for i in range(n_q):
            subset_input = tuple(list(DPqueries[0][i].query.subset_input)
                                 + [range(NoisyChild.shape[-1])])
            query = cenquery.Query(array_dims=NoisyChild.shape, subset=subset_input,
                                   add_over_margins=DPqueries[0][i].query.add_over_margins)
            q_answer = np.stack([DPquery[i].DPanswer for DPquery in DPqueries], axis=-1)
            DP_query = cenquery.DPquery(query=query, DPanswer=q_answer)
            DPqueries_comb.append(DP_query)

    # delete redundant union constraints; which gq categories are non-zero
    # combine cenquery.Constraint objects without geography to build combined
    # constraints with geography; children may have different constraints,
    # so only combine the ones that match
    constraints_comb = []
    if not children[0].cons:
        constraints_comb = None
    else:
        all_keys = []
        for child in children:
            all_keys.extend(list(child.cons.keys()))
        # subset to unique names
        constraint_keys = tuple(list(set(all_keys)))
        # children is a list of nodes
        for key in constraint_keys:
            # make a list of individual constraints for all children who have them;
            # find which children have the key
            ind = [key in child.cons.keys() for child in children]
            # children_sub is the subset of children with that key
            children_sub = list(compress(children, ind))
            constraints = list(child.cons[key] for child in children_sub)
            # get the list of geos that have this constraint
            subset_geos = list(compress(range(NoisyChild.shape[-1]), ind))
            subset_input = tuple(list(constraints[0].query.subset_input) + [subset_geos, ])
            query = cenquery.Query(array_dims=NoisyChild.shape, subset=subset_input,
                                   add_over_margins=constraints[0].query.add_over_margins)
            rhs = np.stack([con.rhs for con in constraints], axis=-1)
            constraint = cenquery.Constraint(query=query, rhs=rhs,
                                             sign=constraints[0].sign,
                                             name=constraints[0].name)
            constraints_comb.append(constraint)

    parent_hist = parent.syn.toDense()
    parent_geocode = parent.geocode
    parent_constraints = parent.cons  # for checking purposes

    # this is the actual post-processing optimization step
    l2_answer, int_answer, backup_solve_status = geoimpgbopt.L2geoimp_wrapper(
        config=config, parent=parent_hist, NoisyChild=NoisyChild, DPqueries=DPqueries_comb,
        constraints=constraints_comb, identifier=parent_geocode,
        parent_constraints=parent_constraints)

    # check constraints
    if constraints_comb is not None:
        check = True
        for x in constraints_comb:
            check = bool(np.prod(x.check(int_answer)) * check)
        print("constraints are ", check, "for parent geocode ", parent_geocode)

    # subset the combined solution to each geography; see the sketch after this
    # function for the indexing pattern
    temps = []
    for i in range(len(child_geos)):
        temp = int_answer[tuple([slice(0, int_answer.shape[x])
                                 for x in range(len(int_answer.shape) - 1)]
                                + [slice(i, i + 1)])]
        temp = temp.squeeze()  # gets rid of dimensions of size 1
        temps.append(temp)

    # do the same for the unrounded answers
    temps2 = []
    for i in range(len(child_geos)):
        temp2 = l2_answer[tuple([slice(0, l2_answer.shape[x])
                                 for x in range(len(l2_answer.shape) - 1)]
                                + [slice(i, i + 1)])]
        temp2 = temp2.squeeze()  # gets rid of dimensions of size 1
        temps2.append(temp2)

    for i, geocode in enumerate(child_geos):
        children[i].syn = sparse.multiSparse(temps[i])
        children[i].syn_unrounded = sparse.multiSparse(temps2[i])

    if backup_solve_status:
        accum += 1
    return children
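# A minimal sketch showing that the legacy slice-tuple subsetting in
# geoimp_wrapper above is equivalent to a simple ellipsis index over the
# trailing geography axis (for histograms with no size-1 dimensions, since
# squeeze() would also drop those).
import numpy as np

int_answer = np.arange(24).reshape(2, 3, 4)  # last axis indexes the children
for i in range(int_answer.shape[-1]):
    legacy = int_answer[tuple([slice(0, s) for s in int_answer.shape[:-1]]
                              + [slice(i, i + 1)])].squeeze()
    assert (legacy == int_answer[..., i]).all()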
def sparse_data(data): """ set up sparse data for testing """ return sparse.multiSparse(data)
def test_sub(dataint, datafloat): assert sparse.multiSparse(dataint) - sparse.multiSparse(dataint) == sparse.multiSparse(dataint*0) assert sparse.multiSparse(datafloat) - sparse.multiSparse(datafloat) == sparse.multiSparse(datafloat * 0)