Python get_control_tableの例、helper.get_control_table Pythonの例

コード例 #1

0

ファイルを表示

ファイル: summarize.py プロジェクト: bellevuewa/populationsim

def summarize_geography(geography, weight_col, crosswalk_df, results_df,
                        incidence_df):
    """
    Tabulate control totals, weighted results and their difference per zone.

    Parameters
    ----------
    geography : str
        name of the geography column (also used to look up the control table)
    weight_col : str
        name of the weight column in results_df to apply to incidence
    crosswalk_df : pandas.DataFrame
        crosswalk with a column named after geography
    results_df : pandas.DataFrame
        weights with a column named after geography, an hh_id column and weight_col
    incidence_df : pandas.DataFrame
        incidence rows, looked up with the hh_id values of results_df

    Returns
    -------
    pandas.DataFrame
        one row per zone, indexed by '<geography>_<zone id>', with 'geography'
        and 'id' columns followed by '<control>_control', '<control>_result'
        and '<control>_diff' columns
    """

    # control totals for the requested geography level
    geo_controls = get_control_table(geography)
    control_names = geo_controls.columns.tolist()

    # keep only crosswalk zones that also have a row in the control table
    zone_ids = geo_controls.index.intersection(crosswalk_df[geography].unique())

    control_rows = []
    result_rows = []
    for zone_id in zone_ids:

        control_rows.append(geo_controls.loc[zone_id].tolist())

        # weight rows for this zone and the incidence rows of their households
        zone_results = results_df[results_df[geography] == zone_id]
        zone_incidence = incidence_df.loc[zone_results.hh_id]
        zone_weights = zone_results[weight_col].tolist()

        # weighted incidence total for each control
        result_rows.append(
            [(zone_incidence[name] * zone_weights).sum() for name in control_names])

    control_values = np.asanyarray(control_rows)
    result_values = np.asanyarray(result_rows)

    control_frame = pd.DataFrame(
        data=control_values,
        columns=['%s_control' % name for name in control_names],
        index=zone_ids)

    result_frame = pd.DataFrame(
        data=result_values,
        columns=['%s_result' % name for name in control_names],
        index=zone_ids)

    diff_frame = pd.DataFrame(
        data=result_values - control_values,
        columns=['%s_diff' % name for name in control_names],
        index=zone_ids)

    summary_df = pd.concat([control_frame, result_frame, diff_frame], axis=1)

    # remember the value columns so the id columns can be placed in front of them
    value_cols = summary_df.columns.tolist()

    summary_df['geography'] = geography
    summary_df['id'] = summary_df.index
    summary_df.index = \
        summary_df['geography'] + '_' + summary_df['id'].astype(str)

    return summary_df[['geography', 'id'] + value_cols]

コード例 #2

0

ファイルを表示

def meta_summary(incidence_df, control_spec, top_geography, top_id,
                 sub_geographies):
    """
    Summarize weighted incidence totals against the controls of one top-level zone.

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        incidence table with a column named after top_geography
    control_spec : pandas.DataFrame
        control spec; its 'target' values are the control columns summarized
    top_geography : str
        name of the geography whose control table is used
    top_id
        id of the zone in top_geography to summarize
    sub_geographies : iterable of str
        geographies whose weight tables (when available) are also summarized

    Returns
    -------
    pandas.DataFrame
        indexed by control name ('control_name'), with a 'control_value' column
        and one '<geography>_<weight column>' column per available weight column
    """

    # restrict incidence to households in the requested zone
    zone_incidence = incidence_df[incidence_df[top_geography] == top_id]

    control_cols = control_spec.target.values

    # controls for this zone as a series
    zone_controls = get_control_table(top_geography)[control_cols].loc[top_id]

    incidence = zone_incidence[control_cols]

    summary = pd.DataFrame(index=control_cols)
    summary.index.name = 'control_name'
    summary['control_value'] = zone_controls

    # seed-level weight columns, skipping those not present in the table
    seed_weights_df = get_weight_table(setting('seed_geography'))
    for weight_col in ('preliminary_balanced_weight', 'balanced_weight',
                       'integer_weight'):
        if weight_col not in seed_weights_df:
            continue
        summary['%s_%s' % (top_geography, weight_col)] = \
            incidence.multiply(seed_weights_df[weight_col], axis="index").sum(axis=0)

    sub_weight_cols = ['balanced_weight', 'integer_weight']
    for sub_geography in sub_geographies:

        sub_weights = get_weight_table(sub_geography)

        # no weight table for this geography
        if sub_weights is None:
            continue

        hh_id_col = setting('household_id_col')

        # total weight per household across the sub zones of this top-level zone
        sub_weights = sub_weights[sub_weights[top_geography] == top_id]
        hh_weights = \
            sub_weights[[hh_id_col] + sub_weight_cols].groupby(hh_id_col).sum()

        for weight_col in sub_weight_cols:
            summary['%s_%s' % (sub_geography, weight_col)] = \
                incidence.multiply(hh_weights[weight_col], axis="index").sum(axis=0)

    return summary

コード例 #3

0

ファイルを表示

ファイル: integerize_final_seed_weights.py プロジェクト: jfdman/populationsim

def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the balanced weights of each seed zone.

    Adds an 'integer_weight' column to the seed-level weight table.

    Parameters
    ----------
    settings : dict-like, read with .get()
    crosswalk : table exposing to_frame()
    control_spec : table exposing to_frame()
    incidence_table : table exposing to_frame()
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    assert (seed_controls_df.columns == control_cols).all()

    # name of the total household control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    # integerize each seed zone independently
    per_seed_weights = []
    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # incidence rows and balanced weights belonging to this seed zone
        seed_incidence = incidence_df[incidence_df[seed_geography] == seed_id]
        in_seed = seed_weights_df[seed_geography] == seed_id
        balanced_seed_weights = seed_weights_df.loc[in_seed, 'balanced_weight']

        # the returned status is not used
        integer_weights, _ = do_integerizing(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col
        )

        per_seed_weights.append(integer_weights)

    # stack the per-seed results into a single series
    integer_seed_weights = pd.concat(per_seed_weights)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_seed_weights)

コード例 #4

0

ファイルを表示

def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights of each seed zone using all control_spec rows.

    Adds a 'balanced_weight' column to the seed-level weight table. If the
    control spec has no rows for the first geography in settings (no meta
    controls), no balancing is run: 'preliminary_balanced_weight' is copied to
    'balanced_weight' unless that column already exists.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # if there are no meta controls, then balanced_weight is simply preliminary_balanced_weight
    geographies = settings['geographies']
    if not (control_spec.geography == geographies[0]).any():
        logger.warning(
            "no need for final_seed_balancing because no meta controls")
        seed_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' not in seed_weights_df:
            final_seed_weights = seed_weights_df['preliminary_balanced_weight']
            inject.add_column(seed_weight_table_name, 'balanced_weight',
                              final_seed_weights)
        return

    # we use all control_spec rows, so no need to filter on geography as for initial_seed_balancing
    seed_controls_df = get_control_table(seed_geography)
    assert (seed_controls_df.columns == control_spec.target).all()

    # name of the total household control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # relaxation factors per control (rows) and seed zone (columns);
    # collected here but not used further within this function
    relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist())

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # log under this step's own name (previously mislabelled as initial_seed_balancing)
        logger.info("final_seed_balancing seed id %s" % seed_id)

        # incidence rows for households in this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "final_seed_balancing for seed_id %s did not converge" %
                seed_id)

        weight_list.append(weights_df['final'])

        relaxation_factors[seed_id] = controls_df['relaxation_factor']

    # bulk concat all seed level results
    final_seed_weights = pd.concat(weight_list)

    inject.add_column(seed_weight_table_name, 'balanced_weight',
                      final_seed_weights)

コード例 #5

0

ファイルを表示

def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a specified geographic level
    in groups by parent zone.

    For instance, if the 'geography' step arg is 'TRACT' and the parent geography is 'SEED',
    then for each seed zone, we simul-balance the TRACTS it contains.

    Creates a weight table for the target geography
    with float 'balanced_weight' and 'integer_weight' columns.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
        Results are registered with inject.add_table: the weight table for the
        target geography, a sparse variant holding only rows with
        integer_weight > 0 and, when the geography is listed under
        'trace_geography' in settings, a trace table for that zone.

    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    # the parent is the geography listed immediately before the target geography
    parent_geography = geographies[geographies.index(geography) - 1]

    # target geography and everything listed after it / everything listed before it
    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = setting('total_hh_control')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    # parent-level weights are the starting point for balancing the sub zones
    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    # the incidence table is siloed by seed geography, so we handle each seed zone in turn
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # a seed zone must map to exactly one meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # list of unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        parent_ids = seed_crosswalk_df[parent_geography].unique()
        # only want ones for which there are (non-zero) controls
        parent_ids = parent_controls_df.index.intersection(parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            # parent zone weights, re-indexed by household id
            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            # expect one parent weight per incidence row of this seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            # (column-wise max over the crosswalk rows of this parent zone)
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    # stack the per-parent-zone results into one table
    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    # sparse table keeps only rows with a positive integer weight
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    # optionally publish the weights of a single traced zone
    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
コード例 #6

0

ファイルを表示

ファイル: initial_seed_balancing.py プロジェクト: bettinardi/populationsim

def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights for each of the seed geographies (independently)
    using the seed level controls and the aggregated sub-zone controls totals.

    Create the seed_weights table with one row per household and columns containing
    household_id, seed geography (e.g. PUMA), and float preliminary_balanced_weights

    Adds seed_weights table to pipeline named <seed_geography>_weights (e.g. PUMA_weights):

    +--------+------+-----------------------------+-------+
    | index  | PUMA | preliminary_balanced_weight | hh_id |
    | hh_id  |      |                             |       |
    +========+======+=============================+=======+
    | 0      | 600  |                   0.313555  |    0  |
    | 1      | 601  |                   0.627110  |    1  |
    | 2      | 602  |                   0.313555  |    2  |
    | ...    |      |                             |       |
    +--------+------+-----------------------------+-------+

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone

    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    # (the geographies listed after the seed geography in settings)
    geographies = settings['geographies']
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        sub_geographies)]

    # name of the total household control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        # incidence rows for households in this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        # balancing starts from the 'sample_weight' column of the incidence table
        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)

    # bulk concat all seed level results
    weights = pd.concat(weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)

コード例 #7

0

ファイルを表示

ファイル: integerize_final_seed_weights.py プロジェクト: bettinardi/populationsim

def integerize_final_seed_weights(settings, crosswalk, control_spec,
                                  incidence_table):
    """
    Integerize the balanced seed-level weights of each seed zone, using all
    control_spec targets and the seed-level control totals.

    Adds integer_weight column to seed-level weight table

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
        The result is attached with inject.add_column.

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    # seed control table columns must match the control_spec targets, in order
    assert (seed_controls_df.columns == control_cols).all()

    # name of the total household control column, if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    # run integerizer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # slice incidence rows for this seed geography
        seed_incidence = incidence_df[incidence_df[seed_geography] == seed_id]

        # float weights to integerize: the 'balanced_weight' rows of this seed zone
        balanced_seed_weights = \
            seed_weights_df.loc[seed_weights_df[seed_geography] == seed_id, 'balanced_weight']

        trace_label = "%s_%s" % (seed_geography, seed_id)

        # the returned status is not inspected here
        integer_weights, status = do_integerizing(
            trace_label=trace_label,
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col)

        weight_list.append(integer_weights)

    # bulk concat all seed level results
    integer_seed_weights = pd.concat(weight_list)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight',
                      integer_seed_weights)

コード例 #8

0

ファイルを表示

def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of one geography, grouped by parent zone.

    The target geography comes from the 'geography' step arg; its parent is the
    geography listed immediately before it in settings['geographies'].

    Registers the weight table for the target geography, a sparse variant
    holding only rows with integer_weight > 0 and, when the geography is listed
    under 'trace_geography' in settings, a trace table for that zone.

    Parameters
    ----------
    settings : dict-like settings
    crosswalk : table exposing to_frame()
    control_spec : table exposing to_frame()
    incidence_table : table exposing to_frame()
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # position of the target geography splits the list into parents and subs
    geography_pos = geographies.index(geography)
    parent_geography = geographies[geography_pos - 1]
    sub_geographies = geographies[geography_pos:]
    parent_geographies = geographies[:geography_pos]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    zone_weight_frames = []

    for seed_id in crosswalk_df[seed_geography].unique():

        # incidence and crosswalk rows belonging to this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # a seed zone must map to exactly one meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        for parent_id in seed_crosswalk_df[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            # parent zone weights, re-indexed by household id
            parent_weights = \
                weights_df[weights_df[parent_geography] == parent_id]\
                .set_index(settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            use_float_weights = setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True)
            weight_col = 'balanced_weight' if use_float_weights else 'integer_weight'
            parent_weights = parent_weights[weight_col]

            assert len(parent_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=parent_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            in_parent = crosswalk_df[parent_geography] == parent_id
            higher_ids = crosswalk_df.loc[in_parent, parent_geographies].max(axis=0)
            for geo_col in higher_ids.index:
                zone_weights_df[geo_col] = higher_ids[geo_col]

            zone_weight_frames.append(zone_weights_df)

    integer_weights_df = pd.concat(zone_weight_frames)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    # sparse table keeps only rows with a positive integer weight
    has_weight = integer_weights_df['integer_weight'] > 0
    inject.add_table(weight_table_name(geography, sparse=True),
                     integer_weights_df[has_weight])

    # nothing more to do unless this geography is being traced
    if 'trace_geography' not in settings or \
            geography not in settings['trace_geography']:
        return

    trace_geography_id = settings.get('trace_geography')[geography]
    traced_df = integer_weights_df[integer_weights_df[geography] == trace_geography_id]
    inject.add_table('trace_%s' % weight_table_name(geography), traced_df)

コード例 #9

0

ファイルを表示

def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize household weights for every zone of the lowest geography.

    For each seed zone and each low-level zone it contains, the seed-level
    'balanced_weight' values are scaled by the ratio of the low zone's total
    household control to the seed zone's, balanced against the low-level
    controls, then integerized.

    Registers the weight table for the lowest geography and a sparse variant
    holding only rows with integer_weight > 0.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a low-level zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    # the lowest geography is the last one listed in settings
    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # log under this step's own name (previously mislabelled as initial_seed_balancing)
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # initial seed weights in series indexed by hh id
        seed_weights_df = all_seed_weights_df[
            all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[
            seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id,
                                           low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't makes sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[
                low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df[
                'balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" %
                        (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" %
                                   trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            # NOTE(review): integerizing receives the full control_spec while
            # balancing uses low_control_spec - confirm this is intended
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info(
                "Total balanced weights for %s = %s" %
                (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s" %
                        (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries
    # (crosswalk rows are looked up by low zone id, in the row order of low_weights_df)
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography), low_weights_df)
    # sparse table keeps only rows with a positive integer weight
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0])
コード例 #10

0

ファイルを表示

ファイル: meta_control_factoring.py プロジェクト: jfdman/populationsim

def meta_control_factoring(settings, control_spec, incidence_table):
    """
    Apply simple factoring to summed household fractional weights based on original
    meta control values relative to summed household fractional weights by meta zone.

    The resulting factored meta control weights will be new meta controls, to be
    appended to the original controls, for final balancing.

    Replaces the seed-level control table in the pipeline with one that also
    holds the new seed-level meta controls. Does nothing (beyond logging a
    warning) when the control spec has no rows for the meta geography.

    Parameters
    ----------
    settings : dict-like settings, read with .get()
    control_spec : table exposing to_frame()
    incidence_table : table exposing to_frame()

    Returns
    -------
    None

    """

    # FIXME - if there is only one seed zone in the meta zone, just copy meta control values?

    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # - if there are no meta controls, then we don't have to do anything
    if not (control_spec.geography == meta_geography).any():
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("meta_control_factoring: no meta targets so nothing to do")
        return

    meta_controls_df = get_control_table(meta_geography)
    dump_table("meta_controls_df", meta_controls_df)

    # slice control_spec to select only the rows for meta level controls
    meta_controls_spec = control_spec[control_spec.geography == meta_geography]
    meta_control_targets = meta_controls_spec['target']

    logger.info("meta_control_factoring %s targets" % len(meta_control_targets))

    dump_table("meta_controls_spec", meta_controls_spec)
    dump_table("meta_control_targets", meta_control_targets)

    # seed level weights of all households (rows aligned with incidence_df rows)
    seed_weights_df = get_weight_table(seed_geography)
    assert len(incidence_df.index) == len(seed_weights_df.index)

    # expand person weights by incidence (incidence will simply be 1 for household targets)
    hh_level_weights = incidence_df[[seed_geography, meta_geography]].copy()
    for target in meta_control_targets:
        hh_level_weights[target] = \
            incidence_df[target] * seed_weights_df['preliminary_balanced_weight']

    dump_table("hh_level_weights", hh_level_weights)

    # weights of meta targets at seed level
    factored_seed_weights = \
        hh_level_weights.groupby([seed_geography, meta_geography], as_index=False).sum()
    factored_seed_weights.set_index(seed_geography, inplace=True)
    dump_table("factored_seed_weights", factored_seed_weights)

    # weights of meta targets summed from seed level to  meta level
    factored_meta_weights = factored_seed_weights.groupby(meta_geography, as_index=True).sum()
    dump_table("factored_meta_weights", factored_meta_weights)

    # only the meta level controls from meta_controls table
    meta_controls_df = meta_controls_df[meta_control_targets]
    dump_table("meta_controls_df", meta_controls_df)

    # compute the scaling factors to be applied to the seed-level totals:
    # meta control value divided by the summed weights of the meta zone
    meta_factors = pd.DataFrame(index=meta_controls_df.index)
    for target in meta_control_targets:
        meta_factors[target] = meta_controls_df[target] / factored_meta_weights[target]
    dump_table("meta_factors", meta_factors)

    # compute seed-level controls from meta-level controls
    seed_level_meta_controls = pd.DataFrame(index=factored_seed_weights.index)
    for target in meta_control_targets:
        #  meta level scaling_factor for this meta_control
        scaling_factor = factored_seed_weights[meta_geography].map(meta_factors[target])
        # scale the seed_level_meta_controls by meta_level scaling_factor
        seed_level_meta_controls[target] = factored_seed_weights[target] * scaling_factor
        # FIXME - why round scaled factored seed_weights to int prior to final seed balancing?
        seed_level_meta_controls[target] = seed_level_meta_controls[target].round().astype(int)
    dump_table("seed_level_meta_controls", seed_level_meta_controls)

    # create final balancing controls
    # add newly created seed_level_meta_controls to the existing set of seed level controls

    seed_controls_df = get_control_table(seed_geography)
    assert len(seed_controls_df.index) == len(seed_level_meta_controls.index)
    seed_controls_df = pd.concat([seed_controls_df, seed_level_meta_controls], axis=1)

    # ensure columns are in right order for orca-extended table
    seed_controls_df = seed_controls_df[control_spec.target]
    assert (seed_controls_df.columns == control_spec.target).all()

    dump_table("seed_controls_df", seed_controls_df)

    pipeline.replace_table(control_table_name(seed_geography), seed_controls_df)

コード例 #11

0

ファイルを表示

ファイル: initial_seed_balancing.py プロジェクト: jfdman/populationsim

def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance household weights independently for each seed zone.

    Only control_spec rows for geographies listed after the seed geography are
    used. Registers the seed-level weight table with the seed geography column,
    a 'preliminary_balanced_weight' column and the household id column; when
    the 'final' step arg is set, 'balanced_weight' is added as a copy of the
    preliminary weights.

    Parameters
    ----------
    settings : dict-like settings
    crosswalk : table exposing to_frame()
    control_spec : table exposing to_frame()
    incidence_table : table exposing to_frame()

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    geographies = settings['geographies']
    first_sub_pos = geographies.index(seed_geography) + 1
    sub_geographies = geographies[first_sub_pos:]
    in_sub_geography = control_spec['geography'].isin(sub_geographies)

    # balancer arguments that are the same for every seed zone
    balancer_options = dict(
        control_spec=control_spec[in_sub_geography],
        total_hh_control_col=settings.get('total_hh_control'),
        max_expansion_factor=settings.get('max_expansion_factor', None))

    # run balancer for each seed geography
    per_seed_weights = []
    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        # incidence rows for households in this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]

        # the returned controls frame is not used
        status, weights_df, _ = do_balancing(
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'],
            **balancer_options)

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        per_seed_weights.append(balanced_weights)

    # bulk concat all seed level results
    all_weights = pd.concat(per_seed_weights)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = all_weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = \
            seed_weights_df['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)