def check(self, data, tol=0.001):
    """
    Check the constraint against data stacked across all child geographies.

    Inputs:
        data: multidimensional numpy array where the last dimension is geography
        tol: the tolerance for the check
    Outputs:
        check_list: a list of bools indicating if the constraint was passed
            for each geography in self.indices
    """
    data_slices = np_utils.sliceArray(data)
    # One single-geography Constraint per (index, rhs) pair; checks collected in order
    return [
        Constraint(self.query, self.rhsList[i], self.sign).check(data_slices[idx], tol=tol)
        for i, idx in enumerate(self.indices)
    ]
def addConstraint(stacked_constraint: StackedConstraint, model, parent_mask, two_d_vars, rounder=False, child_floor=None):
    """
    Adds stacked constraints to the model

    Inputs:
        stacked_constraint: StackedConstraint object (see constraints_dpqueries.py)
        model: gurobi model object
        parent_mask: numpy 1-d boolean array indicating (w/ True) the indexes which then
            correspond to the children that should be included as variables in the model
        two_d_vars: a two dimensional (variables per geography, number of child geographies)
            gurobi tuplelist variable object
        rounder: bool indicating if we are using the rounder function (rather than the l2 function)
        child_floor: a multidimensional numpy array used in the rounder
    """
    import gurobipy as gb
    ASSERT_TYPE(model, gb.Model)
    matrix_rep = stacked_constraint.query.matrixRep()[:, parent_mask]
    answer_list = stacked_constraint.rhsList
    if rounder:
        # In the rounder, the RHS becomes the residual left after subtracting the
        # query answer on the floored child counts.
        # NOTE(review): answer_list aliases stacked_constraint.rhsList, so this
        # mutates the caller's object in place — preserved from the original code.
        child_floor_list = np_utils.sliceArray(child_floor)
        for counter, level in enumerate(stacked_constraint.indices):
            temp = stacked_constraint.query.answer(child_floor_list[level])
            answer_list[counter] = answer_list[counter] - temp
    sense = maps.toGRBFromStr()[stacked_constraint.sign]

    # Convert to CSR once, outside the loop. The original called
    # matrix_rep.tocsr() inside the row loop, redoing the sparse-format
    # conversion for every constraint row.
    csr_rep = matrix_rep.tocsr()

    # Some constraints only appear in a subset of children
    for i in range(len(answer_list[0])):
        # Column indices of the nonzero coefficients in row i
        xlist = csr_rep[i, :].indices.tolist()
        for counter, j in enumerate(stacked_constraint.indices):
            expr = gb.LinExpr()
            for x in xlist:
                expr.add(two_d_vars[x, j])
            model.addConstr(lhs=expr, sense=sense, rhs=answer_list[counter][i],
                            name=stacked_constraint.name)
def geoimp_wrapper(*, config, parent_child_node, accum, min_schema=None):
    """
    This function performs the Post-Processing Step for a generic parent to the Child geography.
    It is called from topdown_engine.py:topdown in a Spark map operation.
    It runs on the CORE and TASK nodes, not on the MASTER. So there is no das object!

    Inputs:
        config: configuration object
        parent_child_node: a (k,v) RDD with key being a geocode and value being a tuple of
            GeounitNode objects containing one parent and multiple children
        accum: spark accumulator object which tracks the number of solves that use the backup solve
        min_schema: backup feasibility schema (reduced schema used for the backup solve)

    Output:
        children: a list of Node objects for each of the children, after post-processing
    """
    # Make sure that the logger is set up on all the nodes
    clogging.setup(level=logging.INFO, syslog='True',
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))

    parent: GeounitNode
    children: List[GeounitNode]
    parent, children = findParentChildNodes(parent_child_node)
    n_children = len(children)

    #######
    # under certain circumstances we can skip the gurobi optimization
    #######
    #
    # Only 1 child: the child histogram must equal the parent's
    if n_children == 1:
        children[0].syn = parent.syn
        return children

    # Empty parent: every child is an all-zero histogram of the same shape
    if parent.syn.sum() == 0:
        for child in children:
            child.syn = sparse.multiSparse(np.zeros(parent.syn.shape))
        return children

    #########
    # resume code for gurobi optimization
    ########
    # stack the dp arrays on top of one another, if only 1 child just expand the axis
    if parent.dp:
        if n_children > 1:
            noisy_child = np.stack([asDense(child.dp.DPanswer) for child in children], axis=-1)
        else:
            noisy_child = np.expand_dims(asDense(children[0].dp.DPanswer),
                                         axis=len(children[0].dp.DPanswer.shape))
    else:
        noisy_child = None

    noisy_child_weight = 1. / children[0].dp.Var if parent.dp else None

    # TODO: Maybe filtering out the detailed query from node.dp_queries can be done neater
    dp_queries_comb = stackNodeProperties(children, lambda node: node.dp_queries,
                                          cons_dpq.StackedDPquery, lambda name: name != C.DETAILED)
    # Materialize the weights as a list. The original used `map(...)`, which in
    # Python 3 returns a lazy single-use iterator (a Python 2 porting leftover):
    # any second pass over it downstream would silently see an empty sequence.
    # We can get actual variance for each query if we want.
    query_weights = [1. / sdpq.Var for sdpq in dp_queries_comb]
    constraints_comb = stackNodeProperties(children, lambda node: node.cons, cons_dpq.StackedConstraint)

    parent_hist = parent.getDenseSyn()
    parent_geocode = parent.geocode

    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None, config=config, parent=parent_hist, parent_shape=parent_hist.shape,
        NoisyChild=noisy_child, childGeoLen=n_children,
        DPqueries=dp_queries_comb, constraints=constraints_comb,
        NoisyChild_weight=noisy_child_weight, query_weights=query_weights,
        identifier=parent_geocode, min_schema=min_schema, stat_node=children[0])

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # slice off the combined child solution to make separate arrays for each child
    int_answer_list = np_utils.sliceArray(int_answer)
    l2_answer_list = np_utils.sliceArray(l2_answer)

    # check constraints (on the dense rounded answers, before sparsifying)
    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        constraintsCheck(child)

    # make sparse arrays
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])

    # Count backup-solve usage in the Spark accumulator
    if backup_solve_status is True:
        accum += 1

    return children
def makeInputsAndRunOptimizer(children, config, min_schema, parent_hist, parent_shape,
                              parent_geocode, optimizers, keep_debug_info=False, aian=False):
    """
    Converts the data from nodes to the inputs taken by optimizer: multiarrays,
    StackedConstraints, StackedDPQueries etc., creates the optimizer, runs it,
    and puts the optimized answers back into the nodes

    This is called from:
        * geoimp_wrapper_root().
        * geoimp_wrapper()

    :param optimizers: tuple of (sequential optimizer name, L2 optimizer, rounder optimizer)
    :param children: iterable (list or multiarray) of children noisy histograms
        (i.e. detailed query measurements, aka noisy counts)
    :param config: DAS config file
    :param min_schema: backup feasibility schema (reduced schema through which constraints can be expressed)
    :param parent_hist: optimized histogram of the parent node
    :param parent_shape: shape of the parent histogram (children have the same shape too)
    :param parent_geocode: parent geocode
    :param keep_debug_info: whether to delete DPqueries after optimization (they take a lot
        of space) and not include unrounded optimized data into the node
    :param aian: whether to split the children into groups (AIAN spine handling)
    :return: list of optimized children nodes and accumulator count of backup feasibility triggers
    """
    if config.getboolean(section=CC.ENGINE, option="reset_dpq_weights", fallback=False):
        # Rescale every child's DP query variances by the global minimum variance
        variances = []
        for child in children:
            variances.extend(child.getAllVariances())
        min_var = min(variances)
        children = [child.setDPQVar(func=lambda v: v / min_var) for child in children]

    # # This is to make sure that total constraint is not accidentally left on for AIAN
    # # and non-AIAN, but really should be taken care of in config
    # # Have to set up the total US population as invariant, and turn off State
    # if aian:
    #     for child in children:
    #         child.removeConstraintByName('total')
    # (NOTE(review): this commented-out block appeared twice in the original; deduplicated.)
    child_groups = makeChildGroups(children) if aian else None

    # Get the stacked detailed dp_queries (if we've taken detailed measurements), as well as
    # their weights. If only one child, just expand.
    noisy_child = np.stack([child.stackDetailedDPAnswers(parent_shape) for child in children],
                           axis=-1) if children[0].dp else None
    noisy_child_weights = [child.detailedWeight() for child in children]
    constraints_comb = stackNodeProperties(children, lambda node: node.cons, cons_dpq.StackedConstraint)

    dp_queries_comb = []
    # A loop over histograms. Each iteration goes over children (stackNodeProperties does that)
    # and gets the dp_queries dict corresponding to that histogram and stacks them.
    for i in range(len(parent_shape)):
        # Bind i as a default argument: a plain closure over the loop variable would be
        # a latent late-binding bug if the lambda were ever called after this loop ends.
        dp_queries_comb.append(
            stackNodeProperties(children, lambda node, i=i: node.querySets2Stack()[i],
                                cons_dpq.StackedDPquery))

    # TODO: Note that multipass rounder queries only support the main histogram currently
    # (hence no loop below). May be necessary for full-scale DHCH to expand this to support
    # the full histogram
    rounder_queries_comb = [
        stackNodeProperties(children, lambda node: node.rounder_queries, cons_dpq.StackedQuery)
    ]
    opt_dict = {
        "Cons": stackNodeProperties(children, lambda node: node.opt_dict["Cons"],
                                    cons_dpq.StackedConstraint),
        "npass_info": children[0].opt_dict["npass_info"],
    } if children[0].opt_dict is not None else None

    sequential_optimizers_dict = {
        CC.L2_PLUS_ROUNDER_WITH_BACKUP: sequential_optimizers.L2PlusRounderWithBackup,
        CC.L2_PLUS_ROUNDER_WITH_BACKUP_INTERLEAVED: sequential_optimizers.L2PlusRounderWithBackup_interleaved,
    }

    seq_opt_name, l2_opt, rounder = optimizers
    seq_opt_cls = sequential_optimizers_dict[seq_opt_name]

    # L2 "constrain to" ordering is optional in the node's query ordering
    try:
        l2c2o = children[0].query_ordering[CC.L2_CONSTRAIN_TO_QUERY_ORDERING]
    except KeyError:
        l2c2o = None

    # Create an appropriate sequential optimizer object
    seq_opt = seq_opt_cls(
        identifier=parent_geocode,
        child_geolevel=children[0].geolevel,
        parent=parent_hist,
        parent_shape=parent_shape,
        childGeoLen=len(children),
        constraints=constraints_comb,
        NoisyChild=noisy_child,
        noisy_child_weights=noisy_child_weights,
        DPqueries=dp_queries_comb,
        rounder_queries=rounder_queries_comb,
        min_schema=(min_schema, False),
        child_groups=child_groups,
        opt_dict=opt_dict,
        L2_DPqueryOrdering=children[0].query_ordering[CC.L2_QUERY_ORDERING],
        L2_Constrain_to_Ordering=l2c2o,
        Rounder_DPqueryOrdering=children[0].query_ordering[CC.ROUNDER_QUERY_ORDERING],
        optimizers=(l2_opt, rounder),
        das=None,
        config=config)

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # Slice off the combined child solution to make lists of ndarrays, with one element
    # for each child (index 0: main histogram, index 1: unit histogram)
    int_answer_list = np_utils.sliceArray(int_answer[0])
    unit_int_answer_list = np_utils.sliceArray(int_answer[1])
    l2_answer_list = np_utils.sliceArray(l2_answer[0])

    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        child.unit_syn = unit_int_answer_list[i]
    # constraintsCheck takes the whole children list, so run it once after all
    # answers are assigned rather than redundantly on every loop iteration
    constraintsCheck(children)

    # Convert to sparse arrays for efficiency
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.unit_syn = sparse.multiSparse(unit_int_answer_list[i])
        if keep_debug_info:
            child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])
        else:
            child.dp_queries.clear()

    return children, backup_solve_status