def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    assert (seed_controls_df.columns == control_cols).all()

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # slice incidence rows for this seed geography
        seed_incidence = incidence_df[incidence_df[seed_geography] == seed_id]

        balanced_seed_weights = \
            seed_weights_df.loc[seed_weights_df[seed_geography] == seed_id, 'balanced_weight']

        trace_label = "%s_%s" % (seed_geography, seed_id)

        integer_weights, status = do_integerizing(
            trace_label=trace_label,
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col
        )

        weight_list.append(integer_weights)

    # bulk concat all seed level results
    integer_seed_weights = pd.concat(weight_list)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_seed_weights)
Ejemplo n.º 2
0
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # if there are no meta controls, then balanced_weight is simply preliminary_balanced_weight
    geographies = settings['geographies']
    if not (control_spec.geography == geographies[0]).any():
        logger.warning(
            "no need for final_seed_balancing because no meta controls")
        seed_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' not in seed_weights_df:
            final_seed_weights = seed_weights_df['preliminary_balanced_weight']
            inject.add_column(seed_weight_table_name, 'balanced_weight',
                              final_seed_weights)
        return

    # we use all control_spec rows, so no need to filter on geography as for initial_seed_balancing
    seed_controls_df = get_control_table(seed_geography)
    assert (seed_controls_df.columns == control_spec.target).all()

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist())

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "final_seed_balancing for seed_id %s did not converge" %
                seed_id)

        weight_list.append(weights_df['final'])

        relaxation_factors[seed_id] = controls_df['relaxation_factor']

    # bulk concat all seed level results
    final_seed_weights = pd.concat(weight_list)

    inject.add_column(seed_weight_table_name, 'balanced_weight',
                      final_seed_weights)
Ejemplo n.º 3
0
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a specified geographic level
    in groups by parent zone.

    For instance, if the 'geography' step arg is 'TRACT' and the parent geography is 'SEED',
    then for each seed zone, we simul-balance the TRACTS it contains.

    Creates a weight table for the target geography
    with float 'balanced_weight' and 'integer_weight' columns.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    parent_geography = geographies[geographies.index(geography) - 1]

    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = setting('total_hh_control')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    # the incidence table is siloed by seed geography, se we handle each seed zone in turn
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # list of unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        parent_ids = seed_crosswalk_df[parent_geography].unique()
        # only want ones for which there are (non-zero) controls
        parent_ids = parent_controls_df.index.intersection(parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights for each of the seed geographies (independently)
    using the seed level controls and the aggregated sub-zone controls totals.

    Create the seed_weights table with one row per household and columns contaiing
    household_id, seed geography (e.g. PUMA), and float preliminary_balanced_weights

    Adds seed_weights table to pipeline named <seed_geography>_weights (e.g. PUMA_weights):

    +--------+------+-----------------------------+-------+
    | index  | PUMA | preliminary_balanced_weight | hh_id |
    | hh_id  |      |                             |       |
    +========+======+=============================+=======+
    | 0      | 600  |                   0.313555  |    0  |
    | 1      | 601  |                   0.627110  |    1  |
    | 2      | 602  |                   0.313555  |    2  |
    | ...    |      |                             |       |
    +--------+------+-----------------------------+-------+

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    geographies = settings['geographies']
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        sub_geographies)]

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)

    # bulk concat all seed level results
    weights = pd.concat(weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)
def integerize_final_seed_weights(settings, crosswalk, control_spec,
                                  incidence_table):
    """
    Final balancing for each seed (puma) zone with aggregated low and mid-level controls and
    distributed meta-level controls.

    Adds integer_weight column to seed-level weight table

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    assert (seed_controls_df.columns == control_cols).all()

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # slice incidence rows for this seed geography
        seed_incidence = incidence_df[incidence_df[seed_geography] == seed_id]

        balanced_seed_weights = \
            seed_weights_df.loc[seed_weights_df[seed_geography] == seed_id, 'balanced_weight']

        trace_label = "%s_%s" % (seed_geography, seed_id)

        integer_weights, status = do_integerizing(
            trace_label=trace_label,
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col)

        weight_list.append(integer_weights)

    # bulk concat all seed level results
    integer_seed_weights = pd.concat(weight_list)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight',
                      integer_seed_weights)
Ejemplo n.º 6
0
def sub_balancing(settings, crosswalk, control_spec, incidence_table):

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    parent_geography = geographies[geographies.index(geography) - 1]

    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        parent_ids = seed_crosswalk_df[parent_geography].unique()

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
Ejemplo n.º 7
0
def repop_balancing(settings, crosswalk, control_spec, incidence_table):

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # initial seed weights in series indexed by hh id
        seed_weights_df = all_seed_weights_df[
            all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[
            seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id,
                                           low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't makes sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[
                low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df[
                'balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" %
                        (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" %
                                   trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info(
                "Total balanced weights for %s = %s" %
                (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s" %
                        (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography), low_weights_df)
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0])
Ejemplo n.º 8
0
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    geographies = settings['geographies']
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        sub_geographies)]

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)

    # bulk concat all seed level results
    weights = pd.concat(weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)