Example #1
def check(self, data, tol=0.001):
    """
    Check the constraints using data across all geos (children)
    Inputs:
        data: multidimensional numpy array whose last dimension is geography
        tol: the tolerance for the check
    Outputs:
        check_list: a list of bools indicating whether the constraint passed for each geography in the indices
    """
    data_list = np_utils.sliceArray(data)
    check_list = []
    for i, index in enumerate(self.indices):
        constraint = Constraint(self.query, self.rhsList[i], self.sign)
        check_list.append(constraint.check(data_list[index], tol=tol))
    return check_list
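np_utils.sliceArray, Constraint, and the class this method belongs to are DAS internals not shown here, so below is a minimal self-contained sketch of the same checking pattern. All names are illustrative stand-ins: sliceArray is assumed to split along the last (geography) axis, and Constraint.check is replaced by a simple equality-within-tolerance test.

import numpy as np

def slice_array(data):
    # Assumed sliceArray semantics: one array per geography (last axis)
    return [data[..., g] for g in range(data.shape[-1])]

def check_stacked(data, rhs_list, indices, tol=0.001):
    """Return one bool per geography listed in indices."""
    data_list = slice_array(data)
    return [bool(np.all(np.abs(data_list[idx] - rhs_list[i]) <= tol))
            for i, idx in enumerate(indices)]

# Two geographies stacked along the last axis; the first matches its rhs, the second does not
stacked = np.stack([np.array([1.0, 2.0]), np.array([3.0, 4.0])], axis=-1)
print(check_stacked(stacked, [np.array([1.0, 2.0]), np.array([0.0, 0.0])], indices=[0, 1]))
# -> [True, False]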
Example #2

def addConstraint(stacked_constraint: StackedConstraint,
                  model,
                  parent_mask,
                  two_d_vars,
                  rounder=False,
                  child_floor=None):
    """
    Adds stacked constraints to the model
    Inputs:
        stacked_constraint: StackedConstraint object (see constraints_dpqueries.py)
        model: gurobi model object
        parent_mask: numpy 1-d boolean array indicating (w/ True) the indices which then correspond to the children
                     that should be included as variables in the model
        two_d_vars: a two-dimensional (variables per geography, number of child geographies) gurobi tuplelist variable object
        rounder: bool indicating if we are using the rounder function (rather than the l2 function)
        child_floor: a multidimensional numpy array used in the rounder
    """
    import gurobipy as gb
    ASSERT_TYPE(model, gb.Model)
    matrix_rep = stacked_constraint.query.matrixRep()[:, parent_mask]
    answer_list = stacked_constraint.rhsList
    if rounder:
        # In the rounder, constrain the leftover: subtract each child's already-assigned floor from the rhs
        child_floor_list = np_utils.sliceArray(child_floor)
        for counter, level in enumerate(stacked_constraint.indices):
            temp = stacked_constraint.query.answer(child_floor_list[level])
            answer_list[counter] = answer_list[counter] - temp

    sense = maps.toGRBFromStr()[stacked_constraint.sign]
    # Some constraints only appear in a subset of children
    for i in range(len(answer_list[0])):
        xlist = matrix_rep.tocsr()[i, :].indices.tolist()
        for counter, j in enumerate(stacked_constraint.indices):
            expr = gb.LinExpr()
            for x in xlist:
                expr.add(two_d_vars[x, j])
            model.addConstr(lhs=expr,
                            sense=sense,
                            rhs=answer_list[counter][i],
                            name=stacked_constraint.name)
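The variable lookup above leans on scipy's CSR layout: tocsr()[i, :].indices is the array of column positions holding nonzero coefficients in constraint row i, i.e. which variables participate in that row. Since gurobipy requires a license, this minimal sketch (with a made-up 2x3 query matrix) shows just the scipy half:

import numpy as np
from scipy import sparse

# Hypothetical query matrix: 2 constraint rows over 3 variables
matrix_rep = sparse.csr_matrix(np.array([[1, 0, 1],
                                         [0, 1, 0]]))
for i in range(matrix_rep.shape[0]):
    xlist = matrix_rep[i, :].indices.tolist()
    print(i, xlist)  # row 0 -> [0, 2]; row 1 -> [1]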
Example #3

def geoimp_wrapper(*, config, parent_child_node, accum, min_schema=None):
    """
    This function performs the Post-Processing Step for a generic parent to the Child geography.
    It is called from topdown_engine.py:topdown in a Spark map operation. 
    It runs on the CORE and TASK nodes, not on the MASTER.
    So there is no das object!
    
    Inputs:
        config: configuration object
        parent_child_node: a (k,v) RDD with key being a geocode and
            value being a tuple of GeounitNode objects containing one parent and multiple children
        accum: spark accumulator object which tracks the number of solves that use the backup solve

    Output:
        children: a list of Node objects for each of the children, after post-processing
    """

    # Make sure that the logger is set up on all the nodes
    clogging.setup(level=logging.INFO,
                   syslog='True',
                   syslog_address=(das_utils.getMasterIp(), C.SYSLOG_UDP))
    parent: GeounitNode
    children: List[GeounitNode]
    parent, children = findParentChildNodes(parent_child_node)

    n_children = len(children)

    #######
    # under certain circumstances we can skip the gurobi optimization
    #######
    #
    # Only 1 child

    if n_children == 1:
        children[0].syn = parent.syn
        return children

    # Parent population is zero, so every child must be all zeros too
    if parent.syn.sum() == 0:
        for child in children:
            child.syn = sparse.multiSparse(np.zeros(parent.syn.shape))
        return children

    ########
    # Resume the Gurobi optimization code path
    ########
    # Stack the DP arrays on top of one another; if there is only 1 child, just expand the axis

    if parent.dp:
        if n_children > 1:
            noisy_child = np.stack(
                [asDense(child.dp.DPanswer) for child in children], axis=-1)
        else:
            noisy_child = np.expand_dims(asDense(children[0].dp.DPanswer),
                                         axis=len(
                                             children[0].dp.DPanswer.shape))
    else:
        noisy_child = None

    # Inverse-variance weight for the noisy child histograms (Var taken from the first child)
    noisy_child_weight = 1. / children[0].dp.Var if parent.dp else None

    # TODO: Maybe filtering out the detailed query from node.dp_queries can be done more neatly
    dp_queries_comb = stackNodeProperties(children,
                                          lambda node: node.dp_queries,
                                          cons_dpq.StackedDPquery,
                                          lambda name: name != C.DETAILED)
    query_weights = map(
        lambda sdpq: 1. / sdpq.Var, dp_queries_comb
    )  # We can get actual variance for each query if we want
    constraints_comb = stackNodeProperties(children, lambda node: node.cons,
                                           cons_dpq.StackedConstraint)
    parent_hist = parent.getDenseSyn()
    parent_geocode = parent.geocode

    seq_opt = sequential_optimizers.L2PlusRounderWithBackup(
        das=None,
        config=config,
        parent=parent_hist,
        parent_shape=parent_hist.shape,
        NoisyChild=noisy_child,
        childGeoLen=n_children,
        DPqueries=dp_queries_comb,
        constraints=constraints_comb,
        NoisyChild_weight=noisy_child_weight,
        query_weights=query_weights,
        identifier=parent_geocode,
        min_schema=min_schema,
        stat_node=children[0])

    # Run the L2 solve followed by the rounder; backup_solve_status reports whether the backup solve was used
    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # slice off the combined child solution to make separate arrays for each child
    int_answer_list = np_utils.sliceArray(int_answer)
    l2_answer_list = np_utils.sliceArray(l2_answer)

    # check constraints
    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        constraintsCheck(child)

    # make sparse arrays
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])

    if backup_solve_status is True:
        accum += 1

    return children
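geoimp_wrapper's two branches for building noisy_child produce identically shaped arrays: np.stack over a one-element list along a new trailing axis equals np.expand_dims at the last position. A quick self-contained numpy check of that equivalence:

import numpy as np

hist = np.arange(6).reshape(2, 3)
stacked = np.stack([hist], axis=-1)                    # the n_children > 1 path, with one child
expanded = np.expand_dims(hist, axis=len(hist.shape))  # the single-child path
assert stacked.shape == expanded.shape == (2, 3, 1)
assert np.array_equal(stacked, expanded)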
Example #4
def makeInputsAndRunOptimizer(children,
                              config,
                              min_schema,
                              parent_hist,
                              parent_shape,
                              parent_geocode,
                              optimizers,
                              keep_debug_info=False,
                              aian=False):
    """
    Converts the data from nodes to the inputs taken by optimizer: multiarrays, StackedConstraints, StackedDPQueries etc.,
    creates the optimizer, runs it, and puts the optimized answers back into the nodes

    This is called from:
         * geoimp_wrapper_root().
         * geoimp_wrapper()
    :param optimizers: tuple of (sequential optimizer name, L2 optimizer, rounder) used to construct the sequential optimizer below
    :param children: iterable (list or multiarray) of children noisy histograms (i.e. detailed query measurements, aka noisy counts)
    :param config: DAS config file
    :param min_schema: backup feasibility schema (reduced schema through which constraints can be expressed)
    :param parent_hist: optimized histogram of the parent node
    :param parent_shape: shape of the parent histogram (children have the same shape too)
    :param parent_geocode: parent geocode
    :param keep_debug_info: if False, delete DPqueries after optimization (they take a lot of space) and do not include the unrounded optimized data in the node
    :return: tuple of (list of optimized children nodes, backup solve status)
    """

    if config.getboolean(section=CC.ENGINE,
                         option="reset_dpq_weights",
                         fallback=False):
        variances = []
        for child in children:
            variances.extend(child.getAllVariances())
        min_var = min(variances)
        children = [
            child.setDPQVar(func=lambda v: v / min_var) for child in children
        ]

    # # This is to make sure that the total constraint is not accidentally left on for AIAN and non-AIAN, but really it should be taken care of in config
    # # Have to set up the total US population as invariant, and turn off State
    # if aian:
    #     for child in children:
    #         child.removeConstraintByName('total')

    child_groups = makeChildGroups(children) if aian else None

    # Get the stacked detailed dp_queries (if we've taken detailed measurements), as well as their weights
    noisy_child = np.stack(
        [child.stackDetailedDPAnswers(parent_shape) for child in children],
        axis=-1) if children[0].dp else None
    noisy_child_weights = [child.detailedWeight() for child in children]
    constraints_comb = stackNodeProperties(children, lambda node: node.cons,
                                           cons_dpq.StackedConstraint)
    dp_queries_comb = []
    # A loop over histograms. Each iteration goes over children (stackNodeProperties does that) and gets the dp_queries dict
    # corresponding to that histogram and stacks them
    for i in range(len(parent_shape)):
        dp_queries_comb.append(
            stackNodeProperties(children,
                                lambda node: node.querySets2Stack()[i],
                                cons_dpq.StackedDPquery))
    # TODO: Note that multipass rounder queries only support the main histogram currently (hence no loop below).
    #  May be necessary for full-scale DHCH to expand this to support the full histogram
    rounder_queries_comb = [
        stackNodeProperties(children, lambda node: node.rounder_queries,
                            cons_dpq.StackedQuery)
    ]

    opt_dict = {
        "Cons": stackNodeProperties(children, lambda node: node.opt_dict["Cons"],
                                    cons_dpq.StackedConstraint),
        "npass_info": children[0].opt_dict["npass_info"],
    } if children[0].opt_dict is not None else None

    sequential_optimizers_dict = {
        CC.L2_PLUS_ROUNDER_WITH_BACKUP: sequential_optimizers.L2PlusRounderWithBackup,
        CC.L2_PLUS_ROUNDER_WITH_BACKUP_INTERLEAVED: sequential_optimizers.L2PlusRounderWithBackup_interleaved,
    }

    seq_opt_name, l2_opt, rounder = optimizers
    seq_opt_cls = sequential_optimizers_dict[seq_opt_name]

    try:
        l2c2o = children[0].query_ordering[CC.L2_CONSTRAIN_TO_QUERY_ORDERING]
    except KeyError:
        l2c2o = None

    # Create an appropriate sequential optimizer object
    seq_opt = seq_opt_cls(
        identifier=parent_geocode,
        child_geolevel=children[0].geolevel,
        parent=parent_hist,
        parent_shape=parent_shape,
        childGeoLen=len(children),
        constraints=constraints_comb,
        NoisyChild=noisy_child,
        noisy_child_weights=noisy_child_weights,
        DPqueries=dp_queries_comb,
        rounder_queries=rounder_queries_comb,
        min_schema=(min_schema, False),
        child_groups=child_groups,
        opt_dict=opt_dict,
        L2_DPqueryOrdering=children[0].query_ordering[CC.L2_QUERY_ORDERING],
        L2_Constrain_to_Ordering=l2c2o,
        Rounder_DPqueryOrdering=children[0].query_ordering[
            CC.ROUNDER_QUERY_ORDERING],
        optimizers=(l2_opt, rounder),
        das=None,
        config=config)

    l2_answer, int_answer, backup_solve_status = seq_opt.run()

    # Slice off the combined child solution to make lists of ndarrays, with one element for each child
    int_answer_list = np_utils.sliceArray(int_answer[0])
    unit_int_answer_list = np_utils.sliceArray(int_answer[1])
    l2_answer_list = np_utils.sliceArray(l2_answer[0])

    for i, child in enumerate(children):
        child.syn = int_answer_list[i]
        child.unit_syn = unit_int_answer_list[i]
    constraintsCheck(children)

    # Convert to sparse arrays for efficiency
    for i, child in enumerate(children):
        child.syn = sparse.multiSparse(int_answer_list[i])
        child.unit_syn = sparse.multiSparse(unit_int_answer_list[i])
        if keep_debug_info:
            child.syn_unrounded = sparse.multiSparse(l2_answer_list[i])
        else:
            child.dp_queries.clear()
    return children, backup_solve_status
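The reset_dpq_weights flag at the top of makeInputsAndRunOptimizer is read with the getboolean(section=..., option=..., fallback=...) pattern. Assuming the DAS config behaves like a standard INI config, the standard-library configparser reproduces the call shape; the [engine] section name below is an illustrative stand-in for CC.ENGINE.

from configparser import ConfigParser

config = ConfigParser()
config.read_string("[engine]\nreset_dpq_weights = on\n")

# Same call shape as in makeInputsAndRunOptimizer; fallback covers a missing option
flag = config.getboolean(section="engine", option="reset_dpq_weights", fallback=False)
missing = config.getboolean(section="engine", option="no_such_option", fallback=False)
assert flag is True and missing is False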