def makeNode(self, persons, geocode='0123456789abcdef'):
    """Build a DHCP_HHGQ GeounitNode from a raw persons table.

    :param persons: row data convertible by np.array, consumed by table2hists
    :param geocode: 16-character block-level geocode for the node
    :return: GeounitNode with raw/raw_housing histograms, invariants,
             constraints, and syn initialized to the raw histogram
    """
    # Turn the tabular person records into the person and housing-unit histograms.
    person_hist, unit_hist = table2hists(
        np.array(persons), self.schema, housing_varname=CC.ATTR_HHGQ)

    # Invariants (held exactly) for the DHCP_HHGQ schema.
    invariants = InvariantsMaker.make(
        schema=CC.DAS_DHCP_HHGQ,
        raw=person_hist,
        raw_housing=unit_hist,
        invariant_names=('tot', 'gqhh_tot', 'gqhh_vect'))

    # Constraints derived from the invariants (bounds on HHGQ totals, etc.).
    constraints = PConstraintsCreator(
        hist_shape=(person_hist.shape, unit_hist.shape),
        invariants=invariants,
        constraint_names=('hhgq_total_lb', 'hhgq_total_ub', 'nurse_nva_0')
    ).calculateConstraints().constraints_dict

    # Geocode-prefix-length -> geographic level mapping for the full spine.
    geocode_levels = {
        16: 'Block',
        12: 'Block_Group',
        11: 'Tract',
        5: 'County',
        2: 'State',
        1: 'US',
    }

    node = GeounitNode(
        raw=person_hist,
        raw_housing=unit_hist,
        invar=invariants,
        cons=constraints,
        geocode_dict=geocode_levels,
        geocode=geocode)
    # Start with the synthetic histogram equal to the raw one.
    node.syn = node.raw
    return node
def makeNode(self, hholds, units, geocode='0'):
    """Build a Household2010 GeounitNode from raw household and unit tables.

    :param hholds: household row data convertible by np.array
    :param units: housing-unit row data convertible by np.array
    :param geocode: geocode string for the (single-level 'Stub') node
    :return: GeounitNode with raw/raw_housing histograms, invariants,
             constraints, and syn initialized to the raw histogram
    """
    # Households and units are converted through their own schemas.
    hhold_hist = table2hists(np.array(hholds), self.schema)
    unit_hist = table2hists(
        np.array(units), self.unit_schema, CC.ATTR_HHGQ, units=True)

    # Invariants held exactly for the Household2010 schema.
    invariants = InvariantsMaker.make(
        schema=CC.SCHEMA_HOUSEHOLD2010,
        raw=hhold_hist,
        raw_housing=unit_hist,
        invariant_names=('tot', 'gqhh_vect'))

    # Household-level structural constraints derived from the invariants.
    constraints = HHConstraintsCreator(
        hist_shape=(hhold_hist.shape, unit_hist.shape),
        invariants=invariants,
        constraint_names=('no_vacant', 'living_alone', 'size2')
    ).calculateConstraints().constraints_dict

    node = GeounitNode(
        raw=hhold_hist,
        raw_housing=unit_hist,
        invar=invariants,
        cons=constraints,
        geocode_dict={1: 'Stub'},
        geocode=geocode)
    # Start with the synthetic histogram equal to the raw one.
    node.syn = node.raw
    return node
def sample_histogram(node: GeounitNode, sample_target: int):
    """Replace the node's synthetic histogram with a multinomial resample.

    Draws `sample_target` total counts from a multinomial whose cell
    probabilities come from the node's raw histogram, preserving the raw
    histogram's sparsity pattern.

    :param node: The input GeounitNode which will receive a new sampled histogram
    :param sample_target: The size of the target sample population
    :return: The input node with its syn attribute set to the sampled histogram
    """
    # Guard: the node must carry a fully-populated sparse raw histogram.
    assert node.raw is not None
    assert isinstance(node.raw, multiSparse)
    assert node.raw.sparse_array is not None
    assert node.raw.sparse_array.data is not None

    # Histogram shape (for the multiSparse wrapper) and the CSR structure
    # (shape / indices / indptr) needed to rebuild a matrix with the same
    # sparsity pattern as the raw data.
    hist_shape = node.raw.shape
    raw_csr = node.raw.sparse_array
    csr_shape = raw_csr.shape
    col_indices = raw_csr.indices
    row_pointers = raw_csr.indptr

    # Probability vector over the populated cells.
    pval = BootstrapEngine.compute_pval(node)

    # Draw the new cell counts; total is exactly sample_target.
    drawn_counts = numpy.random.multinomial(sample_target, pval)

    # Rebuild a CSR matrix with the sampled counts in the original pattern
    # and wrap it back up as a histogram.
    resampled = ss.csr_matrix(
        (drawn_counts, col_indices, row_pointers), shape=csr_shape)
    node.syn = multiSparse(resampled, shape=hist_shape)
    return node
def geoimp_wrapper_nat(*, config, parent_shape, nat_node: GeounitNode, min_schema=None):
    """
    This function performs the Post-Processing Step of National to National level.
    It is called from engine_utils.py:topdown in a Spark map operation

    Inputs:
        config: configuration object
        parent_shape: shape of the parent histogram (passed through to the optimizer)
        nat_node: a GeounitNode object referring to entire nation
        min_schema: optional reduced schema passed to the optimizer

    Output:
        nat_node: a GeounitNode object referring to entire nation, with syn
        (rounded) and syn_unrounded (L2) histograms set
    """
    # Make sure that the logger is set up on all of the nodes
    clogging.setup(level=logging.INFO, syslog=True,
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))
    # t_start = time.time()
    # Nation has no parent histogram.
    parent_hist = None

    # Dense DP answer with a trailing axis of length 1 added (the optimizer
    # expects a child-geography axis); None when no DP measurement is present.
    noisy_child = np.expand_dims(
        asDense(nat_node.dp.DPanswer),
        axis=len(nat_node.dp.DPanswer.shape)) if nat_node.dp else None
    # Inverse-variance weight for the detailed noisy child.
    noisy_child_weight = 1. / nat_node.dp.Var if nat_node.dp else None

    parent_geocode = "nat_to_nat"

    # TODO: Maybe filtering out the detailed query form node.dp_queries can be done neater
    dp_queries_comb = stackNodeProperties(
        [nat_node, ], lambda node: node.dp_queries,
        cons_dpq.StackedDPquery, lambda name: name != C.DETAILED)
    # NOTE: map() is lazy — the optimizer consumes this iterator.
    query_weights = map(
        lambda sdpq: 1. / sdpq.Var,
        dp_queries_comb)  # We can get actual variance for each query if we want
    constraints_comb = stackNodeProperties(
        [nat_node, ], lambda node: node.cons, cons_dpq.StackedConstraint)

    # Create an L2PlusRounderWithBackup object
    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None,
        parent=parent_hist,
        parent_shape=parent_shape,
        NoisyChild=noisy_child,
        childGeoLen=1,
        config=config,
        DPqueries=dp_queries_comb,
        constraints=constraints_comb,
        NoisyChild_weight=noisy_child_weight,
        query_weights=query_weights,
        identifier="nat_to_nat",
        min_schema=min_schema,
        stat_node=nat_node)

    # Run the L2 solve plus integer rounding (with backup solve if needed).
    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # get rid of extra dimension
    int_answer = int_answer.squeeze()
    l2_answer = l2_answer.squeeze()

    # Constraints are checked against the dense integer answer before it is
    # re-wrapped as a sparse histogram.
    nat_node.syn = int_answer
    constraintsCheck(nat_node, parent_geocode)

    nat_node.syn = sparse.multiSparse(int_answer)
    nat_node.syn_unrounded = sparse.multiSparse(l2_answer)
    return nat_node
def conform2PL94(node: GeounitNode):
    """Cap the node's synthetic counts at the PL94 invariant counts.

    Each cell of the dense synthetic histogram is replaced by the PL94
    count whenever it exceeds that count (an elementwise minimum), and the
    result is stored back on node.syn as a multiSparse histogram.

    :param node: GeounitNode carrying a synthetic histogram and a
                 'pl94counts' invariant
    :return: the same node, with syn capped at the PL94 counts
    """
    synthetic = node.getDenseSyn()
    pl94_cap = node.invar['pl94counts']
    capped = np.where(synthetic > pl94_cap, pl94_cap, synthetic)
    node.syn = multiSparse(capped)
    return node