def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that specify the
    input file name, the name of the pipeline table, and keys allowing the specification
    of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial run has completed,
    in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example.

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
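
    Example table_list entry (illustrative only; file and column names below
    are placeholders, not taken from the example folder)::

        input_table_list:
          - tablename: households
            filename: seed_households.csv
            index_col: hh_id
            column_map:
              hhnum: hh_id
            drop_columns:
              - filler_col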

    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = config.setting(table_list_name)

    assert table_list is not None, "no table list '%s' found in settings." % table_list_name

    logger.info('Using table list: %s' % table_list)

    for table_info in table_list:

        tablename = table_info.get('tablename')
        df = input.read_from_table_info(table_info)
        logger.info('registering table %s' % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
Example #2
def step_add_col():

    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    col_name = inject.get_step_arg('column_name')
    assert col_name is not None

    table = pipeline.get_table(table_name)

    assert col_name not in table.columns

    table[col_name] = table.index + (1000 * len(table.columns))

    pipeline.replace_table(table_name, table)
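
# A minimal standalone sketch (plain pandas, no pipeline involved) of the
# arithmetic in step_add_col: the new column is the row index plus 1000 times
# the pre-existing column count. Names and values are made up.
def _demo_step_add_col():
    import pandas as pd

    table = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})   # two columns
    table['new_col'] = table.index + (1000 * len(table.columns))
    assert list(table['new_col']) == [2000, 2001, 2002]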
Example #3
def step2():

    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    table2 = pd.DataFrame({'column1': [10, 20, 30]})
    inject.add_table(table_name, table2)
Example #4
def step_forget_tab():

    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    table = pipeline.get_table(table_name)

    pipeline.drop_table(table_name)
Example #5
def out_table(table_name, df):

    table_name = "summary_%s" % table_name

    if AS_CSV:
        file_name = "%s.csv" % table_name
        output_dir = inject.get_injectable('output_dir')
        file_path = os.path.join(output_dir, file_name)
        logger.info("writing output file %s" % file_path)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
    else:
        logger.info("saving summary table %s" % table_name)
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(table_name, df, replace=repop)
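
# A small sketch of the write_index rule above: out_table writes the dataframe
# index to csv only when the index has a name (as it would after set_index on
# a real id column). Example data is hypothetical.
def _demo_write_index_rule():
    import pandas as pd

    df = pd.DataFrame({'x': [1, 2]})
    assert df.index.name is None        # unnamed RangeIndex -> index omitted
    df.index.name = 'zone_id'
    assert df.index.name is not None    # named index -> index written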
Example #6
def annotate_table(configs_dir):

    # model_settings name should have been provided as a step argument
    model_name = inject.get_step_arg('model_name')

    model_settings = config.read_model_settings(configs_dir, '%s.yaml' % model_name)

    df_name = model_settings['DF']
    df = inject.get_table(df_name).to_frame()

    results = expressions.compute_columns(
        df,
        model_settings=model_settings,
        configs_dir=configs_dir,
        trace_label=None)

    assign_in_place(df, results)

    pipeline.replace_table(df_name, df)
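
# An illustrative model settings yaml for annotate_table (names hypothetical).
# Only the DF key is read directly above; the SPEC key naming the expressions
# file is an assumption about what expressions.compute_columns expects here.
#
#     annotate_people.yaml
#     --------------------
#     DF: persons
#     SPEC: annotate_people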
Example #7
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that specify the
    input file name, the name of the pipeline table, and keys allowing the specification
    of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial populationsim run has
    completed, in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example.

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']

        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file
        data_filename = table_info.get('filename', None)
        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError(
                "input_pre_processor %s - input file not found: %s" % (
                    tablename,
                    data_file_path,
                ))

        logger.info("Reading csv file %s" % data_file_path)
        df = pd.read_csv(data_file_path, comment='#')

        logger.info("input file columns: %s" % df.columns.values)

        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                assert not df.duplicated(index_col).any()
                df.set_index(index_col, inplace=True)
            else:
                df.index.names = [index_col]

        # read expression file
        # expression_filename = table_info.get('expression_filename', None)
        # if expression_filename:
        #     assert False
        #     expression_file_path = os.path.join(configs_dir, expression_filename)
        #     if not os.path.exists(expression_file_path):
        #         raise RuntimeError("input_pre_processor %s - expression file not found: %s"
        #                            % (table, expression_file_path, ))
        #     spec = assign.read_assignment_spec(expression_file_path)
        #
        #     df_alias = table_info.get('df_alias', table)
        #
        #     locals_d = {}
        #
        #     results, trace_results, trace_assigned_locals \
        #         = assign.assign_variables(spec, df, locals_d, df_alias=df_alias)
        #     # for column in results.columns:
        #     #     orca.add_column(table, column, results[column])
        #
        #     df = pd.concat([df, results], axis=1)

        logger.info("adding table %s" % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
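
# A minimal sketch of the two index_col branches above: when the named column
# is present it becomes the (duplicate-free) index; otherwise the existing
# index is simply renamed. Data and names are made up for illustration.
def _demo_index_col_handling():
    import pandas as pd

    df = pd.DataFrame({'hh_id': [1, 2], 'x': [10, 20]})
    df.set_index('hh_id', inplace=True)     # 'hh_id' column present
    assert df.index.name == 'hh_id'

    df2 = pd.DataFrame({'x': [10, 20]})
    df2.index.names = ['hh_id']             # no such column: rename the index
    assert df2.index.name == 'hh_id'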
Example #8
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights for each of the seed geographies (independently)
    using the seed level controls and the aggregated sub-zone controls totals.

    Create the seed_weights table with one row per household and columns containing
    household_id, seed geography (e.g. PUMA), and float preliminary_balanced_weight

    Adds seed_weights table to pipeline named <seed_geography>_weights (e.g. PUMA_weights):

    +--------+------+-----------------------------+-------+
    | index  | PUMA | preliminary_balanced_weight | hh_id |
    | hh_id  |      |                             |       |
    +========+======+=============================+=======+
    | 0      | 600  |                   0.313555  |    0  |
    | 1      | 601  |                   0.627110  |    1  |
    | 2      | 602  |                   0.313555  |    2  |
    | ...    |      |                             |       |
    +--------+------+-----------------------------+-------+

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for seed geography and below
    geographies = settings['geographies']
    seed_geographies = geographies[geographies.index(seed_geography):]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        seed_geographies)]

    # determine master_control_index if specified in settings
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)
    min_expansion_factor = settings.get('min_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []
    sample_weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            min_expansion_factor=min_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)
        sample_weight_list.append(seed_incidence_df['sample_weight'])

    # bulk concat all seed level results
    weights = pd.concat(weight_list)
    sample_weights = pd.concat(sample_weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    seed_weights_df['sample_weight'] = sample_weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    repop = inject.get_step_arg('repop', default=False)
    inject.add_table(weight_table_name(seed_geography),
                     seed_weights_df,
                     replace=repop)
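
# A hedged usage sketch: with seed_geography 'PUMA', initial_seed_balancing
# leaves its weights in a pipeline table named 'PUMA_weights' (per the
# <seed_geography>_weights convention described in the docstring above).
# 'PUMA' is only an example geography name; uses this module's pipeline import.
def _demo_read_seed_weights():
    seed_weights_df = pipeline.get_table('PUMA_weights')
    return seed_weights_df['preliminary_balanced_weight'].sum()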
Example #9
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a specified geographic level
    in groups by parent zone.

    For instance, if the 'geography' step arg is 'TRACT' and the parent geography is 'SEED',
    then for each seed zone, we simul-balance the TRACTS it contains.

    Creates a weight table for the target geography
    with float 'balanced_weight' and 'integer_weight' columns.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    Nothing: adds the geography weight tables to the pipeline.
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    parent_geography = geographies[geographies.index(geography) - 1]

    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = setting('total_hh_control')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    # the incidence table is siloed by seed geography, so we handle each seed zone in turn
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # list of unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        parent_ids = seed_crosswalk_df[parent_geography].unique()
        # only want ones for which there are (non-zero) controls
        parent_ids = parent_controls_df.index.intersection(parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
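
# A worked sketch of the geography-list slicing used in sub_balancing, with a
# hypothetical four-level hierarchy (not taken from any real settings file).
def _demo_geography_slicing():
    geographies = ['REGION', 'PUMA', 'TRACT', 'TAZ']
    geography = 'TRACT'

    parent_geography = geographies[geographies.index(geography) - 1]
    assert parent_geography == 'PUMA'

    sub_geographies = geographies[geographies.index(geography):]
    assert sub_geographies == ['TRACT', 'TAZ']       # geography and below

    parent_geographies = geographies[:geographies.index(geography)]
    assert parent_geographies == ['REGION', 'PUMA']  # everything above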
def expand_households():
    """
    Create a complete expanded synthetic household list with their assigned geographic zone ids.

    This is the skeleton synthetic household id list with no household or person attributes,
    one row per household, with geography columns and seed household table household_id.

    Creates pipeline table expanded_household_ids
    """

    if setting('NO_INTEGERIZATION_EVER', False):
        logger.warning("skipping expand_households: NO_INTEGERIZATION_EVER")
        inject.add_table('expanded_household_ids', pd.DataFrame())
        return

    geographies = setting('geographies')
    household_id_col = setting('household_id_col')

    low_geography = geographies[-1]

    # only one we really need is low_geography
    seed_geography = setting('seed_geography')
    geography_cols = geographies[geographies.index(seed_geography):]

    weights = get_weight_table(low_geography, sparse=True)
    weights = weights[geography_cols + [household_id_col, 'integer_weight']]

    # - expand weights table by integer_weight, so there is one row per desired hh
    weight_cols = weights.columns.values
    weights_np = np.repeat(weights.values,
                           weights.integer_weight.values,
                           axis=0)
    expanded_weights = pd.DataFrame(data=weights_np, columns=weight_cols)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):

        # the household_id_col is really the group_id
        expanded_weights.rename(columns={household_id_col: 'group_id'},
                                inplace=True)

        # the original incidence table with one row per hh, with index hh_id
        household_groups = pipeline.get_table('household_groups')
        household_groups = household_groups[[
            household_id_col, 'group_id', 'sample_weight'
        ]]

        # for each group, lists of hh_ids and their sample_weights (as relative probabilities)
        # [ [ [<group_0_hh_id_list>], [<group_0_hh_prob_list>] ],
        #   [ [<group_1_hh_id_list>], [<group_1_hh_prob_list>] ], ... ]
        HH_IDS = 0
        HH_PROBS = 1
        grouper = household_groups.groupby('group_id')
        group_hh_probs = [0] * len(grouper)
        for group_id, df in grouper:
            hh_ids = list(df[household_id_col])
            probs = list(df.sample_weight / df.sample_weight.sum())
            group_hh_probs[group_id] = [hh_ids, probs]

        # FIXME - should sample without replacement?
        # now make a hh_id choice for each group_id in expanded_weights
        def chooser(group_id):
            hh_ids = group_hh_probs[group_id][HH_IDS]
            hh_probs = group_hh_probs[group_id][HH_PROBS]
            return np.random.choice(hh_ids, p=hh_probs)
        expanded_weights[household_id_col] = \
            expanded_weights.group_id.apply(chooser, convert_dtype=True,)

        # FIXME - omit in production?
        del expanded_weights['group_id']
        del expanded_weights['integer_weight']

    append = inject.get_step_arg('append', False)
    replace = inject.get_step_arg('replace', False)
    assert not (
        append and
        replace), "can't specify both append and replace for expand_households"

    if append or replace:
        t = inject.get_table('expanded_household_ids').to_frame()
        prev_hhs = len(t.index)
        added_hhs = len(expanded_weights.index)

        if replace:
            # FIXME - should really get from crosswalk table?
            low_ids_to_replace = expanded_weights[low_geography].unique()
            t = t[~t[low_geography].isin(low_ids_to_replace)]

        expanded_weights = pd.concat([t, expanded_weights], ignore_index=True)

        dropped_hhs = prev_hhs - len(t.index)
        final_hhs = len(expanded_weights.index)
        op = 'append' if append else 'replace'
        logger.info(
            "expand_households op: %s prev hh count %s dropped %s added %s final %s"
            % (op, prev_hhs, dropped_hhs, added_hhs, final_hhs))

    repop = inject.get_step_arg('repop', default=False)
    inject.add_table('expanded_household_ids', expanded_weights, replace=repop)
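
# A minimal sketch (made-up data) of the np.repeat expansion used above: each
# weights row is duplicated integer_weight times, yielding one row per
# synthesized household.
def _demo_expand_by_integer_weight():
    import numpy as np
    import pandas as pd

    weights = pd.DataFrame({'TAZ': [100, 101],
                            'hh_id': [7, 8],
                            'integer_weight': [2, 3]})
    expanded = pd.DataFrame(
        data=np.repeat(weights.values, weights.integer_weight.values, axis=0),
        columns=weights.columns.values)
    assert len(expanded.index) == 5    # 2 + 3 expanded rows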
Example #11
def sub_balancing(settings, crosswalk, control_spec, incidence_table):

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    parent_geography = geographies[geographies.index(geography) - 1]

    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        parent_ids = seed_crosswalk_df[parent_geography].unique()

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
def input_pre_processor():

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']

        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file
        data_filename = table_info.get('filename', None)
        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError(
                "input_pre_processor %s - input file not found: %s" % (
                    tablename,
                    data_file_path,
                ))

        logger.info("Reading csv file %s" % data_file_path)
        df = pd.read_csv(data_file_path, comment='#')

        logger.info("input file columns: %s" % df.columns.values)

        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                assert not df.duplicated(index_col).any()
                df.set_index(index_col, inplace=True)
            else:
                df.index.names = [index_col]

        # read expression file
        # expression_filename = table_info.get('expression_filename', None)
        # if expression_filename:
        #     assert False
        #     expression_file_path = os.path.join(configs_dir, expression_filename)
        #     if not os.path.exists(expression_file_path):
        #         raise RuntimeError("input_pre_processor %s - expression file not found: %s"
        #                            % (table, expression_file_path, ))
        #     spec = assign.read_assignment_spec(expression_file_path)
        #
        #     df_alias = table_info.get('df_alias', table)
        #
        #     locals_d = {}
        #
        #     results, trace_results, trace_assigned_locals \
        #         = assign.assign_variables(spec, df, locals_d, df_alias=df_alias)
        #     # for column in results.columns:
        #     #     orca.add_column(table, column, results[column])
        #
        #     df = pd.concat([df, results], axis=1)

        logger.info("adding table %s" % tablename)

        inject.add_table(tablename, df)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    geographies = settings['geographies']
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        sub_geographies)]

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)

    # bulk concat all seed level results
    weights = pd.concat(weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)