Beispiel #1
0
def setup_data_structures(output_dir, settings, configs_dir, households,
                          persons):
    """
    Build the crosswalk, control and incidence tables used by later steps.

    Registers 'crosswalk', 'control_spec', one control table per geography and
    'incidence_table' (plus 'household_groups' when the
    GROUP_BY_INCIDENCE_SIGNATURE setting is truthy) through inject.add_table,
    and replaces the 'households' and 'persons' pipeline tables with the
    frames returned by filter_households.

    Parameters
    ----------
    output_dir : str
        not used by this function
    settings : dict
        only its 'geographies' entry is read directly
    configs_dir : str
        directory searched for the control spec file
    households : table object exposing to_frame()
    persons : table object exposing to_frame()
    """

    seed_geography = setting('seed_geography')

    hh_df = households.to_frame()
    per_df = persons.to_frame()

    # geographic correspondence
    crosswalk_df = build_crosswalk_table()
    inject.add_table('crosswalk', crosswalk_df)

    # control spec (file name is configurable, defaults to controls.csv)
    spec_file_name = setting('control_file_name', 'controls.csv')
    control_spec = read_control_spec(spec_file_name, configs_dir)
    inject.add_table('control_spec', control_spec)

    # one control table per geography
    for geography in settings['geographies']:
        geography_controls = \
            build_control_table(geography, control_spec, crosswalk_df)
        inject.add_table(control_table_name(geography), geography_controls)

    # filtered seed tables replace the originals in the pipeline
    hh_df, per_df = filter_households(hh_df, per_df, crosswalk_df)
    pipeline.replace_table('households', hh_df)
    pipeline.replace_table('persons', per_df)

    incidence_table = \
        build_incidence_table(control_spec, hh_df, per_df, crosswalk_df)
    incidence_table = \
        add_geography_columns(incidence_table, hh_df, crosswalk_df)

    # household sample weight rides along as an incidence table column
    weight_col = setting('household_weight_col')
    incidence_table['sample_weight'] = hh_df[weight_col]

    if not setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        inject.add_table('incidence_table', incidence_table)
        return

    grouped_table, household_groups = \
        build_grouped_incidence_table(incidence_table, control_spec, seed_geography)

    inject.add_table('household_groups', household_groups)
    inject.add_table('incidence_table', grouped_table)
Beispiel #2
0
def add_geography_columns(incidence_table, households_df, crosswalk_df):
    """
    Attach seed and meta geography id columns to incidence_table.

    The meta geography is the first entry of the 'geographies' setting and the
    seed geography comes from the 'seed_geography' setting.

    Parameters
    ----------
    incidence_table : pandas.DataFrame
        modified in place (two columns are added)
    households_df : pandas.DataFrame
        must contain the seed geography column
    crosswalk_df : pandas.DataFrame
        must contain the seed and meta geography columns

    Returns
    -------
    incidence_table : pandas.DataFrame
        the same object that was passed in
    """

    meta_col = setting('geographies')[0]
    seed_col = setting('seed_geography')

    # seed zone of each household
    incidence_table[seed_col] = households_df[seed_col]

    # series mapping seed id -> meta id (smallest meta id listed for the seed)
    seed_meta_pairs = crosswalk_df[[seed_col, meta_col]]
    meta_by_seed = \
        seed_meta_pairs.groupby(seed_col, as_index=True).min()[meta_col]

    # meta zone of each household, looked up through its seed zone
    household_seed_ids = incidence_table[seed_col]
    incidence_table[meta_col] = household_seed_ids.map(meta_by_seed)

    return incidence_table
Beispiel #3
0
def meta_summary(incidence_df, control_spec, top_geography, top_id,
                 sub_geographies):
    """
    Compare control totals with weighted incidence totals for one top-level zone.

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        incidence table with a top_geography column and one column per
        control target
    control_spec : pandas.DataFrame
        only the 'target' column is read
    top_geography : str
        name of the geography column identifying the zone
    top_id
        id of the zone to summarize
    sub_geographies : list of str
        geographies whose weight tables (when available) are also summarized

    Returns
    -------
    summary : pandas.DataFrame
        indexed by control name, with a 'control_value' column and one
        '<geography>_<weight column>' column per summarized weight
    """

    # integer weights only exist when integerization is enabled
    if setting('NO_INTEGERIZATION_EVER', False):
        seed_weight_cols = ['preliminary_balanced_weight', 'balanced_weight']
        sub_weight_cols = ['balanced_weight']
    else:
        seed_weight_cols = [
            'preliminary_balanced_weight', 'balanced_weight', 'integer_weight'
        ]
        sub_weight_cols = ['balanced_weight', 'integer_weight']

    incidence_df = incidence_df[incidence_df[top_geography] == top_id]

    control_cols = control_spec.target.values

    controls_df = get_control_table(top_geography)

    # controls for this geography as series
    controls = controls_df[control_cols].loc[top_id]

    incidence = incidence_df[control_cols]

    summary = pd.DataFrame(index=control_cols)

    summary.index.name = 'control_name'

    summary['control_value'] = controls

    seed_geography = setting('seed_geography')
    seed_weights_df = get_weight_table(seed_geography)

    for c in seed_weight_cols:
        # not every seed weight column is necessarily present yet
        if c in seed_weights_df:
            summary_col_name = '%s_%s' % (top_geography, c)
            summary[summary_col_name] = \
                incidence.multiply(seed_weights_df[c], axis="index").sum(axis=0)

    # household id column as configured elsewhere in this file; the previous
    # hard-coded 'hh_id' remains the fallback when that column is absent
    configured_hh_id_col = setting('household_id_col')

    for g in sub_geographies:

        sub_weights = get_weight_table(g)

        if sub_weights is None:
            continue

        if configured_hh_id_col in sub_weights.columns:
            hh_id_col = configured_hh_id_col
        else:
            hh_id_col = 'hh_id'

        sub_weights = sub_weights[sub_weights[top_geography] == top_id]

        # total weight of each household over all sub zones of this top zone
        sub_weights = sub_weights[[hh_id_col] +
                                  sub_weight_cols].groupby(hh_id_col).sum()

        for c in sub_weight_cols:
            summary['%s_%s' % (g, c)] = \
                incidence.multiply(sub_weights[c], axis="index").sum(axis=0)

    return summary
def build_grouped_incidence_table(incidence_table, control_spec, seed_geography):
    """
    Collapse households with identical incidence signatures into groups.

    Households are grouped on all control target columns plus seed_geography.
    A 'group_id' column is added to incidence_table in place.

    Parameters
    ----------
    incidence_table : pandas.DataFrame
        household incidence table with a 'sample_weight' column
    control_spec : pandas.DataFrame
        only the 'target' column is read
    seed_geography : str

    Returns
    -------
    group_incidence_table : pandas.DataFrame
        one row per group, with summed 'sample_weight', 'group_size' and
        'group_id' columns
    household_groups : pandas.DataFrame
        'group_id', 'sample_weight' and household id column for each household
    """

    id_col = setting('household_id_col')

    signature_cols = list(control_spec.target) + [seed_geography]
    by_signature = incidence_table.groupby(signature_cols)

    # remaining columns take the group max; weights are summed, members counted
    grouped = by_signature.max()
    grouped['sample_weight'] = by_signature.sum()['sample_weight']
    grouped['group_size'] = by_signature.count()['sample_weight']
    grouped = grouped.reset_index()

    n_groups = len(grouped.index)
    n_households = len(incidence_table.index)
    logger.info("grouped incidence table has %s entries, ungrouped has %s"
                % (n_groups, n_households))

    # the positional index after reset_index serves as the group id
    grouped['group_id'] = grouped.index

    # look up each household's group by joining on its signature
    signature_to_group = grouped[signature_cols + ['group_id']]
    household_signatures = incidence_table[signature_cols]
    matched = household_signatures.merge(
        signature_to_group, on=signature_cols, how='left')
    incidence_table['group_id'] = matched.group_id.astype(int).values

    # grouped and ungrouped incidence tables share the same index name so
    # that later steps can handle them interchangeably
    grouped.index.name = incidence_table.index.name

    # table mapping groups to their households and sample weights, with the
    # household id also available as an explicit column
    household_groups = incidence_table[['group_id', 'sample_weight']].copy()
    household_groups[id_col] = household_groups.index.astype(int)

    return grouped, household_groups
Beispiel #5
0
def filter_households(households_df, persons_df, crosswalk_df):
    """
    Drop households that have no positive weight or lie outside the seed zones.

    Parameters
    ----------
    households_df : pandas.DataFrame
        must contain the household weight and seed geography columns
    persons_df : pandas.DataFrame
        returned unchanged
    crosswalk_df : pandas.DataFrame
        its seed geography column defines the valid seed zone ids

    Returns
    -------
    households_df : pandas.DataFrame
        filtered copy of the households table
    persons_df : pandas.DataFrame
        the persons table as passed in
    """

    # drop any zero weight households (there are some in calm data)
    hh_weight_col = setting('household_weight_col')
    households_df = households_df[households_df[hh_weight_col] > 0]

    # remove any households not in seed zones
    seed_geography = setting('seed_geography')
    seed_ids = crosswalk_df[seed_geography].unique()

    rows_in_seed_zones = households_df[seed_geography].isin(seed_ids)

    # always apply the mask: guarding on rows_in_seed_zones.any() used to keep
    # every out-of-zone household when none at all were inside a seed zone
    households_df = households_df[rows_in_seed_zones]
    logger.info("dropped %s households not in seed zones" %
                (~rows_in_seed_zones).sum())
    logger.info("kept %s households in seed zones" % len(households_df))

    if len(households_df) == 0:
        logger.warning("no households left after filtering to seed zones")

    return households_df, persons_df
Beispiel #6
0
def repop_setup_data_structures(settings, configs_dir, households, persons):
    """
    Rebuild crosswalk, control and incidence tables for a repop run.

    Replaces the 'crosswalk', 'control_spec', per-geography control tables and
    'incidence_table' (plus 'household_groups' when the
    GROUP_BY_INCIDENCE_SIGNATURE setting is truthy) pipeline tables. The
    'households' and 'persons' pipeline tables are not replaced.

    Parameters
    ----------
    settings : dict
        not used by this function
    configs_dir : str
        directory searched for the repop control spec file
    households : table object exposing to_frame()
    persons : table object exposing to_frame()
    """

    seed_geography = setting('seed_geography')
    geographies = setting('geographies')

    # replace crosswalk table
    crosswalk_df = build_crosswalk_table()
    pipeline.replace_table('crosswalk', crosswalk_df)

    # replace control_spec
    control_file_name = setting('repop_control_file_name',
                                'repop_controls.csv')
    control_spec = read_control_spec(control_file_name, configs_dir)
    pipeline.replace_table('control_spec', control_spec)

    # build incidence_table with repop controls and households in repop zones
    # filter households (dropping any not in crosswalk) in order to build incidence_table
    # We DO NOT REPLACE households and persons as we need full tables to synthesize population
    households_df = households.to_frame()
    persons_df = persons.to_frame()
    # use the persons frame returned by the filter (it was previously bound to
    # an unused name while the pre-filter frame was passed on)
    households_df, persons_df = filter_households(
        households_df, persons_df, crosswalk_df)
    incidence_table = build_incidence_table(control_spec, households_df,
                                            persons_df, crosswalk_df)
    incidence_table = add_geography_columns(incidence_table, households_df,
                                            crosswalk_df)
    # add sample_weight col to incidence table
    hh_weight_col = setting('household_weight_col')
    incidence_table['sample_weight'] = households_df[hh_weight_col]

    # rebuild control tables with only the low level controls (aggregated at higher levels)
    for g in geographies:
        controls = build_control_table(g, control_spec, crosswalk_df)
        pipeline.replace_table(control_table_name(g), controls)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        group_incidence_table, household_groups \
            = build_grouped_incidence_table(incidence_table, control_spec, seed_geography)

        pipeline.replace_table('household_groups', household_groups)
        pipeline.replace_table('incidence_table', group_incidence_table)
    else:
        pipeline.replace_table('incidence_table', incidence_table)
Beispiel #7
0
def summarize_geography(geography, weight_col, crosswalk_df, results_df,
                        incidence_df):
    """
    Tabulate control, result and difference totals for each zone of a geography.

    Parameters
    ----------
    geography : str
        name of the geography (also a column of crosswalk_df and results_df)
    weight_col : str
        column of results_df holding the household weights to apply
    crosswalk_df : pandas.DataFrame
    results_df : pandas.DataFrame
        one row per (zone, household) with geography, household id and
        weight_col columns
    incidence_df : pandas.DataFrame
        incidence table, looked up by household id with .loc

    Returns
    -------
    summary_df : pandas.DataFrame
        indexed by '<geography>_<zone id>', with 'geography' and 'id' columns
        followed by '<control>_control', '<control>_result' and
        '<control>_diff' columns
    """

    # controls_table for current geography level
    controls_df = get_control_table(geography)
    control_names = controls_df.columns.tolist()

    # only want zones from crosswalk for which non-zero control rows exist
    zone_ids = crosswalk_df[geography].unique()
    zone_ids = controls_df.index.intersection(zone_ids)

    # loop invariant, so looked up once
    hh_id_col = setting('household_id_col')

    results = []
    controls = []
    for zone_id in zone_ids:

        controls.append(controls_df.loc[zone_id].tolist())

        # weights assigned to households in this zone
        zone_weights = results_df[results_df[geography] == zone_id]

        incidence = incidence_df.loc[zone_weights[hh_id_col]]

        weights = zone_weights[weight_col].tolist()
        results.append(
            [(incidence[c] * weights).sum() for c in control_names])

    # explicit 2-D shape: with no zones np.asanyarray([]) would be 1-D and the
    # DataFrame constructors below would fail on a shape mismatch
    shape = (len(zone_ids), len(control_names))
    controls_array = np.asanyarray(controls).reshape(shape)
    results_array = np.asanyarray(results).reshape(shape)

    controls_df = pd.DataFrame(
        data=controls_array,
        columns=['%s_control' % c for c in control_names],
        index=zone_ids)

    summary_df = pd.DataFrame(data=results_array,
                              columns=['%s_result' % c for c in control_names],
                              index=zone_ids)

    dif_df = pd.DataFrame(data=results_array - controls_array,
                          columns=['%s_diff' % c for c in control_names],
                          index=zone_ids)

    summary_df = pd.concat([controls_df, summary_df, dif_df], axis=1)

    summary_cols = summary_df.columns.tolist()

    # identify rows by geography and zone id, and move those columns up front
    summary_df['geography'] = geography
    summary_df['id'] = summary_df.index
    summary_df.index = summary_df['geography'] + '_' + summary_df['id'].astype(
        str)
    summary_df = summary_df[['geography', 'id'] + summary_cols]

    return summary_df
def merge_seed_data(expanded_household_ids, seed_data_df, seed_columns, trace_label):
    """
    Left-join requested seed data columns onto the expanded household id table.

    Parameters
    ----------
    expanded_household_ids : pandas.DataFrame
        must contain the household id column
    seed_data_df : pandas.DataFrame
        seed table with the household id either as its index name or as a
        column
    seed_columns : list of str
        seed columns wanted in the result; unknown names only trigger a warning
    trace_label : str
        table label used in warning messages

    Returns
    -------
    merged_df : pandas.DataFrame
    """

    seed_geography = setting('seed_geography')
    hh_col = setting('household_id_col')

    available_columns = seed_data_df.columns.values

    # report requested columns that the seed table does not have
    for col in seed_columns:
        if col != hh_col and col not in available_columns:
            logger.warn("column '%s' not in %s" % (col, trace_label))

    # keep only the requested columns that actually exist
    wanted_columns = [col for col in seed_columns if col in available_columns]

    # the seed geography is already present in expanded_household_ids
    if seed_geography in wanted_columns:
        wanted_columns.remove(seed_geography)

    # join on the seed table index when it is the household id, otherwise on
    # its household id column (the persons case)
    join_on_index = (seed_data_df.index.name == hh_col)
    if hh_col in seed_data_df.columns and not join_on_index:
        join_column = hh_col
    else:
        join_column = None
    assert join_on_index or join_column

    if join_column and hh_col not in wanted_columns:
        wanted_columns.append(hh_col)

    merged_df = pd.merge(
        left=expanded_household_ids,
        right=seed_data_df[wanted_columns],
        how="left",
        left_on=hh_col,
        right_on=join_column,
        right_index=join_on_index
    )

    if hh_col in seed_columns:
        return merged_df

    # household id was only needed as the join key
    del merged_df[hh_col]
    return merged_df
def build_crosswalk_table():
    """
    Return the geographic crosswalk restricted to the configured geographies.

    Rows are kept only when their lowest-geography id appears in the control
    data table of the lowest geography (the last entry of the 'geographies'
    setting), in case the 'geo_cross_walk' table lists unused low zones.
    """

    geographies = setting('geographies')

    full_crosswalk = inject.get_table('geo_cross_walk').to_frame()

    # columns for other geographies are not needed
    crosswalk = full_crosswalk[geographies]

    lowest = geographies[-1]
    controlled_ids = get_control_data_table(lowest)[lowest]

    has_controls = crosswalk[lowest].isin(controlled_ids)
    return crosswalk[has_controls]
def read_control_spec(data_filename, configs_dir):
    """
    Load the control spec csv from configs_dir and validate its geographies.

    Parameters
    ----------
    data_filename : str
        name of the csv file (lines starting with '#' are comments)
    configs_dir : str

    Returns
    -------
    control_spec : pandas.DataFrame

    Raises
    ------
    RuntimeError
        if the file does not exist, has no 'geography' column, or names a
        geography missing from the 'geographies' setting
    """

    spec_path = os.path.join(configs_dir, data_filename)
    if not os.path.exists(spec_path):
        raise RuntimeError(
            "initial_seed_balancing - control file not found: %s" % (spec_path,))

    logger.info("Reading control file %s" % spec_path)
    control_spec = pd.read_csv(spec_path, comment='#')

    known_geographies = setting('geographies')

    if 'geography' not in control_spec.columns:
        raise RuntimeError("missing geography column in controls file")

    # complain about the first geography that is not configured
    unknown = [g for g in control_spec.geography.unique()
               if g not in known_geographies]
    if unknown:
        raise RuntimeError(
            "unknown geography column '%s' in control file" % unknown[0])

    return control_spec
Beispiel #11
0
def build_incidence_table(control_spec, households_df, persons_df,
                          crosswalk_df):
    """
    Evaluate every control expression into a household-level incidence table.

    Parameters
    ----------
    control_spec : pandas.DataFrame
        rows provide 'target', 'seed_table' and 'expression'
    households_df : pandas.DataFrame
        its index becomes the incidence table index
    persons_df : pandas.DataFrame
        must contain the household id column
    crosswalk_df : pandas.DataFrame
        not used by this function

    Returns
    -------
    incidence_table : pandas.DataFrame
        one column per control target
    """

    hh_col = setting('household_id_col')

    # expressions are evaluated against the seed table named in the spec row
    seed_tables = dict(households=households_df, persons=persons_df)

    incidence_table = pd.DataFrame(index=households_df.index)

    for row in control_spec.itertuples():

        target = row.target
        logger.info("control target %s" % target)
        seed_table_name = row.seed_table
        logger.debug("control_row.seed_table %s" % seed_table_name)
        expression = row.expression
        logger.debug("control_row.expression %s" % expression)

        incidence, _ = assign_variable(
            target=target,
            expression=expression,
            df=seed_tables[seed_table_name],
            locals_dict={'np': np},
            df_alias=seed_table_name,
            trace_rows=None)

        # multiplying by one turns boolean True/False into 1/0
        incidence = incidence * 1

        if seed_table_name == 'persons':
            # sum person-level incidence up to the owning household
            per_person = pd.DataFrame({
                hh_col: persons_df[hh_col],
                'incidence': incidence
            })
            incidence = per_person.groupby([hh_col], as_index=True).sum()

        incidence_table[target] = incidence

    return incidence_table
Beispiel #12
0
def write_results(output_dir):
    """
    Write pipeline tables to '<table name>.csv' files in output_dir.

    By default every checkpointed table is written. The optional
    'output_tables' setting narrows this with an 'action' ('include' or
    'skip') and a 'tables' list; a missing 'tables' entry is treated as an
    empty list.

    Parameters
    ----------
    output_dir : str

    Raises
    ------
    RuntimeError
        if the 'output_tables' action is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables = pipeline.checkpointed_tables()

    if output_tables_settings is not None:

        action = output_tables_settings.get('action')
        tables = output_tables_settings.get('tables') or []

        if action not in ['include', 'skip']:
            # a bare string is not a valid exception, so wrap the message
            raise RuntimeError(
                "expected %s action '%s' to be either 'include' or 'skip'" %
                (output_tables_settings_name, action))

        if action == 'include':
            output_tables = tables
        else:
            output_tables = [t for t in output_tables if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables.append("checkpoints.csv")

    for table_name in output_tables:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # an unnamed index carries no information worth writing
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
def input_pre_processor():
    """
    Read the csv input tables listed in settings and register them as tables.

    The name of the settings entry holding the table list comes from the
    'table_list' step argument and defaults to 'input_table_list'. Each entry
    is a dict with 'tablename' and 'filename' and optional 'drop_columns',
    'column_map' and 'index_col' keys.

    Raises
    ------
    RuntimeError
        if an entry has no filename or its file does not exist
    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']

        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file
        data_filename = table_info.get('filename', None)
        if data_filename is None:
            # fail clearly instead of crashing inside os.path.join
            raise RuntimeError(
                "input_pre_processor %s - no filename specified" % tablename)

        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError(
                "input_pre_processor %s - input file not found: %s" % (
                    tablename,
                    data_file_path,
                ))

        logger.info("Reading csv file %s" % data_file_path)
        df = pd.read_csv(data_file_path, comment='#')

        # column listing goes to the debug log rather than stdout
        logger.debug("%s columns: %s" % (tablename, df.columns.tolist()))

        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                # index values must be unique
                assert not df.duplicated(index_col).any()
                df.set_index(index_col, inplace=True)
            else:
                # no such column: just name the existing index
                df.index.names = [index_col]

        # NOTE: per-table expression files ('expression_filename') are not
        # processed here

        logger.info("adding table %s" % tablename)

        inject.add_table(tablename, df)
def repop_setup_data_structures(configs_dir, households, persons):
    """
    Setup geographic correspondence (crosswalk), control sets, and incidence tables for repop run.

    New lowest-level geography control tables should already have been read in by rerunning
    input_pre_processor with a table_list override. The control table contains one row for
    each zone, with columns specifying control field totals for that control.

    This step reads in the repop control file, which specifies which control fields
    in the control table should be used for balancing, along with their importance and the
    recipe (seed table and expression) for determining household incidence for that control.

    Parameters
    ----------
    configs_dir : str
    households: pipeline table
    persons: pipeline table

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        if the repop control spec names any geography other than the lowest one
    """

    seed_geography = setting('seed_geography')
    geographies = setting('geographies')
    low_geography = geographies[-1]

    # replace crosswalk table
    crosswalk_df = build_crosswalk_table()
    pipeline.replace_table('crosswalk', crosswalk_df)

    # replace control_spec
    control_file_name = setting('repop_control_file_name',
                                'repop_controls.csv')
    control_spec = read_control_spec(control_file_name, configs_dir)

    # repop control spec should only specify controls for lowest level geography
    # (compare as lists: comparing the ndarray from unique() directly raises
    # ValueError instead of failing the assertion when it has several entries)
    spec_geographies = list(control_spec.geography.unique())
    assert spec_geographies == [low_geography], \
        "repop control spec should only specify controls for '%s' (found %s)" \
        % (low_geography, spec_geographies)

    pipeline.replace_table('control_spec', control_spec)

    # build incidence_table with repop controls and households in repop zones
    # filter households (dropping any not in crosswalk) in order to build incidence_table
    # We DO NOT REPLACE households and persons as we need full tables to synthesize population
    # (There is no problem, however, with overwriting the incidence_table and household_groups
    # because the expand_households step has ALREADY created the expanded_household_ids table
    # for the original simulated population. )

    households_df = households.to_frame()
    persons_df = persons.to_frame()
    households_df, persons_df = filter_households(households_df, persons_df,
                                                  crosswalk_df)
    incidence_table = build_incidence_table(control_spec, households_df,
                                            persons_df, crosswalk_df)
    incidence_table = add_geography_columns(incidence_table, households_df,
                                            crosswalk_df)
    # add sample_weight col to incidence table
    hh_weight_col = setting('household_weight_col')
    incidence_table['sample_weight'] = households_df[hh_weight_col]

    # rebuild control tables with only the low level controls (aggregated at higher levels)
    for g in geographies:
        controls = build_control_table(g, control_spec, crosswalk_df)
        pipeline.replace_table(control_table_name(g), controls)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        group_incidence_table, household_groups \
            = build_grouped_incidence_table(incidence_table, control_spec, seed_geography)

        pipeline.replace_table('household_groups', household_groups)
        pipeline.replace_table('incidence_table', group_incidence_table)
    else:
        pipeline.replace_table('incidence_table', incidence_table)
def setup_data_structures(settings, configs_dir, households, persons):
    """
    Create the crosswalk, control and incidence tables needed for balancing.

    Control data tables for the target geographies are expected to have been
    loaded already by input_pre_processor. The control spec file read here
    lists, for each control, the seed table and expression used to compute
    household incidence.

    When the GROUP_BY_INCIDENCE_SIGNATURE setting is enabled the incidence
    table has one row per household group, and an additional household_groups
    table maps group ids to household ids.

    Parameters
    ----------
    settings : dict
        contents of settings.yaml as dict (only 'geographies' is read directly)
    configs_dir : str
    households : pipeline table
    persons : pipeline table

    creates tables:
        crosswalk
        control_spec
        one control table per geography
        incidence_table
        household_groups (if GROUP_BY_INCIDENCE_SIGNATURE setting is enabled)

    replaces pipeline tables:
        households
        persons
    """

    seed_geography = setting('seed_geography')

    hh_df = households.to_frame()
    per_df = persons.to_frame()

    # geographic correspondence
    crosswalk_df = build_crosswalk_table()
    inject.add_table('crosswalk', crosswalk_df)

    # control spec (file name is configurable, defaults to controls.csv)
    spec_file_name = setting('control_file_name', 'controls.csv')
    control_spec = read_control_spec(spec_file_name, configs_dir)
    inject.add_table('control_spec', control_spec)

    # one control table per geography
    for geography in settings['geographies']:
        geography_controls = \
            build_control_table(geography, control_spec, crosswalk_df)
        inject.add_table(control_table_name(geography), geography_controls)

    # filtered seed tables replace the originals in the pipeline
    hh_df, per_df = filter_households(hh_df, per_df, crosswalk_df)
    pipeline.replace_table('households', hh_df)
    pipeline.replace_table('persons', per_df)

    incidence_table = \
        build_incidence_table(control_spec, hh_df, per_df, crosswalk_df)
    incidence_table = \
        add_geography_columns(incidence_table, hh_df, crosswalk_df)

    # household sample weight rides along as an incidence table column
    weight_col = setting('household_weight_col')
    incidence_table['sample_weight'] = hh_df[weight_col]

    if not setting('GROUP_BY_INCIDENCE_SIGNATURE'):
        inject.add_table('incidence_table', incidence_table)
        return

    grouped_table, household_groups = \
        build_grouped_incidence_table(incidence_table, control_spec, seed_geography)

    inject.add_table('household_groups', household_groups)
    inject.add_table('incidence_table', grouped_table)
def _synthetic_table_options(synthetic_tables_settings, settings_name,
                             table_name, synthetic_hh_col):
    """Return validated (options, seed_columns) for one synthetic output table."""

    options = synthetic_tables_settings.get(table_name, None)
    if options is None:
        raise RuntimeError("Options for '%s' not found in '%s' in settings" %
                           (table_name, settings_name))

    seed_columns = options.get('columns')
    if seed_columns is None:
        # fail clearly instead of crashing when iterating over None below
        raise RuntimeError("No columns for '%s' found in '%s' in settings" %
                           (table_name, settings_name))

    # the synthetic id column must not clash (case-insensitively) with a seed column
    if synthetic_hh_col.lower() in [c.lower() for c in seed_columns]:
        raise RuntimeError(
            "synthetic household_id column '%s' also appears in seed column list"
            % synthetic_hh_col)

    return options, seed_columns


def write_synthetic_population(expanded_household_ids, households, persons,
                               output_dir):
    """
    Write synthetic households and persons tables to output dir as csv files.
    The settings file allows specification of output file names, household_id column name,
    and seed data attribute columns to include in output files.

    Parameters
    ----------
    expanded_household_ids : pipeline table
    households : pipeline table
    persons : pipeline table
    output_dir : str

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if the 'output_synthetic_population' setting, the options of either
        table or their 'columns' list are missing, or if the synthetic
        household id column name also appears in a seed column list
    """

    expanded_household_ids = expanded_household_ids.to_frame()
    households = households.to_frame()
    persons = persons.to_frame()

    SETTINGS_NAME = 'output_synthetic_population'
    synthetic_tables_settings = setting(SETTINGS_NAME)
    if synthetic_tables_settings is None:
        raise RuntimeError("'%s' not found in settings" % SETTINGS_NAME)

    synthetic_hh_col = synthetic_tables_settings.get('household_id', 'HH_ID')

    # - assign household_ids to synthetic population (1-based row number)
    expanded_household_ids.reset_index(drop=True, inplace=True)
    expanded_household_ids[
        'synthetic_hh_id'] = expanded_household_ids.index + 1

    # - households

    TABLE_NAME = 'households'
    options, seed_columns = _synthetic_table_options(
        synthetic_tables_settings, SETTINGS_NAME, TABLE_NAME, synthetic_hh_col)

    df = merge_seed_data(expanded_household_ids,
                         households,
                         seed_columns=seed_columns,
                         trace_label=TABLE_NAME)

    # synthetic_hh_id is index
    df.rename(columns={'synthetic_hh_id': synthetic_hh_col}, inplace=True)
    df.set_index(synthetic_hh_col, inplace=True)

    filename = options.get('filename', '%s.csv' % TABLE_NAME)
    file_path = os.path.join(output_dir, filename)
    df.to_csv(file_path, index=True)

    # - persons (validated only after the households file has been written)

    TABLE_NAME = 'persons'
    options, seed_columns = _synthetic_table_options(
        synthetic_tables_settings, SETTINGS_NAME, TABLE_NAME, synthetic_hh_col)

    df = merge_seed_data(expanded_household_ids,
                         persons,
                         seed_columns=seed_columns,
                         trace_label=TABLE_NAME)

    # FIXME drop or rename old seed hh_id column?
    df.rename(columns={'synthetic_hh_id': synthetic_hh_col}, inplace=True)

    filename = options.get('filename', '%s.csv' % TABLE_NAME)
    file_path = os.path.join(output_dir, filename)
    df.to_csv(file_path, index=False)
Beispiel #17
0
def build_control_table(geo, control_spec, crosswalk_df):
    """
    Assemble the control totals table for one geography.

    Controls specified at geo are taken from its own control data table,
    indexed by geo. Controls specified at geographies listed after geo in the
    'geographies' setting are summed up to the geo level.

    Parameters
    ----------
    geo : str
        target geography; must be listed in the 'geographies' setting
    control_spec : pandas.DataFrame
        rows provide 'geography', 'control_field' and 'target'
    crosswalk_df : pandas.DataFrame
        used to map sub-geography ids to geo ids when the control data table
        lacks a geo column

    Returns
    -------
    controls : pandas.DataFrame
        indexed by geo id, one column per control target in control_spec order
    """

    all_geographies = setting('geographies')
    assert geo in all_geographies

    # target geography and everything listed after it
    control_geographies = all_geographies[all_geographies.index(geo):]

    # spec rows for those geographies only
    relevant = control_spec['geography'].isin(control_geographies)
    control_spec = control_spec[relevant]

    controls_list = []
    for g in control_geographies:

        spec = control_spec[control_spec['geography'] == g]

        # a geography may have no controls at all (e.g. seed)
        if not len(spec.index):
            continue

        control_data_df = get_control_data_table(g)
        wanted_columns = [geo] + spec.control_field.tolist()

        if g == geo:
            # control data of the target geography is only re-indexed by geo
            assert geo in control_data_df.columns
            controls = control_data_df[wanted_columns].set_index(geo)
        else:
            # tag each sub zone with its geo id unless the data already has it
            if geo not in control_data_df.columns:
                sub_to_geo = \
                    crosswalk_df[[g, geo]].groupby(g, as_index=True).min()[geo]
                control_data_df[geo] = control_data_df[g].map(sub_to_geo)

            # sum sub zone controls up to the geo level
            by_geo = control_data_df[wanted_columns].groupby(geo, as_index=True)
            controls = by_geo.sum()

        controls_list.append(controls)

    # side by side, one block of columns per geography
    controls = pd.concat(controls_list, axis=1)

    # control data field names -> control target names
    field_to_target = dict(zip(control_spec.control_field, control_spec.target))
    controls = controls.rename(columns=field_to_target)

    # column order follows the control_spec rows
    return controls[control_spec.target]
Beispiel #18
0
def summarize(crosswalk, incidence_table, control_spec):
    """
    Write aggregate summary tables of controls and weights for the configured geographies.

    Via out_table this emits:

    - one meta_summary table per id of the first (meta) geography
    - for every geography listed after the seed geography that has a weight table:
      a '<geography>_aggregate' table of its weights summed by seed zone and household,
      and summarize_geography tables by seed geography and by the geography itself
    - a final 'hh_weights' table with per-household balanced and integer weights for the
      seed geography and each of those sub geographies

    Parameters
    ----------
    crosswalk : pipeline table
    incidence_table : pipeline table
    control_spec : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()

    geographies = setting('geographies')
    seed_geography = setting('seed_geography')
    meta_geography = geographies[0]
    first_sub_position = geographies.index(seed_geography) + 1
    sub_geographies = geographies[first_sub_position:]
    household_id_col = setting('household_id_col')

    # the incidence table is indexed by household id
    hh_id_col = incidence_df.index.name
    hh_weight_cols = [household_id_col, 'balanced_weight', 'integer_weight']

    # one summary table per meta zone
    for meta_id in crosswalk_df[meta_geography].unique():
        meta_summary_df = meta_summary(
            incidence_df, control_spec, meta_geography, meta_id, sub_geographies)
        out_table('%s_%s' % (meta_geography, meta_id), meta_summary_df)

    hh_weights_summary = pd.DataFrame(index=incidence_df.index)

    # seed level weights go into the household summary first
    seed_weights_df = get_weight_table(seed_geography)
    hh_weights_summary['%s_balanced_weight' % seed_geography] = \
        seed_weights_df['balanced_weight']
    hh_weights_summary['%s_integer_weight' % seed_geography] = \
        seed_weights_df['integer_weight']

    for geography in sub_geographies:

        weights_df = get_weight_table(geography)

        # nothing to summarize for geographies without a weight table
        if weights_df is None:
            continue

        # total weight per household across all zones of this geography
        hh_weights = weights_df[hh_weight_cols].groupby([household_id_col]).sum()
        hh_weights_summary['%s_balanced_weight' % geography] = \
            hh_weights['balanced_weight']
        hh_weights_summary['%s_integer_weight' % geography] = \
            hh_weights['integer_weight']

        # aggregate to seed level, then line the result up with the seed weights by household
        seed_level_weights = weights_df \
            .groupby([seed_geography, hh_id_col], as_index=False).sum() \
            .set_index(hh_id_col)
        seed_level_weights = \
            seed_level_weights[[seed_geography, 'balanced_weight', 'integer_weight']]

        seed_level_weights['sample_weight'] = incidence_df['sample_weight']
        seed_level_weights['%s_preliminary_balanced_weight' % seed_geography] = \
            seed_weights_df['preliminary_balanced_weight']
        seed_level_weights['%s_balanced_weight' % seed_geography] = \
            seed_weights_df['balanced_weight']
        seed_level_weights['%s_integer_weight' % seed_geography] = \
            seed_weights_df['integer_weight']

        out_table('%s_aggregate' % (geography, ), seed_level_weights)

        # integer weight summaries, first by seed zone then by zone of this geography
        by_seed_df = summarize_geography(
            seed_geography, 'integer_weight', crosswalk_df, weights_df, incidence_df)
        out_table('%s_%s' % (geography, seed_geography, ), by_seed_df)

        by_geography_df = summarize_geography(
            geography, 'integer_weight', crosswalk_df, weights_df, incidence_df)
        out_table('%s' % (geography, ), by_geography_df)

    out_table('hh_weights', hh_weights_summary)
Beispiel #19
0
def _melt_control_summary(df):
    """
    Reshape a wide control summary table into long format.

    For every column of df named '<var>_control', the '<var>_control', '<var>_result' and
    '<var>_diff' columns are melted and joined on (geography, id, variable), giving one row
    per geography/id/variable with 'control', 'result' and 'diff' value columns.

    Parameters
    ----------
    df : pandas.DataFrame
        table with 'geography' and 'id' columns and <var>_control/_result/_diff columns

    Returns
    -------
    pandas.DataFrame or None
        the long format table, or None if df has no '_control' columns
    """
    id_vars = ["geography", "id"]

    control_vars = [
        column[:-8] for column in list(df.columns.values) if column[-8:] == "_control"
    ]
    logger.debug("control variables for melt %s" % str(control_vars))

    if not control_vars:
        return None

    melt_df = None
    for suffix in ("control", "result", "diff"):
        col_names = ["%s_%s" % (cv, suffix) for cv in control_vars]

        # melt stores the wide column name in 'variable'; map it back to the bare variable
        # name so the three melts can be joined on it
        part_df = df.melt(id_vars=id_vars, value_vars=col_names, value_name=suffix) \
            .replace(to_replace=dict(zip(col_names, control_vars)))

        if melt_df is None:
            melt_df = part_df
        else:
            melt_df = pd.merge(left=melt_df, right=part_df, how="left",
                               on=id_vars + ["variable"])

    return melt_df


def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    Pipeline tables are intermediate computational tables, not to be confused with the
    synthetic population tables written by the write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls and weights
    tables for the various geographies. For example, if one of your geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    In addition, every written table that has 'geography' and 'id' columns and
    '<var>_control' / '<var>_result' / '<var>_diff' columns is melted to long format; if any
    such rows exist they are written together to summary_melt.csv. Building the melt is best
    effort: a table that cannot be melted is skipped (and the reason logged at debug level).

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        if the output_tables action is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # a missing or empty 'tables' entry means "no tables named" rather than a TypeError below
    tables = output_tables_settings.get('tables') or []

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    logger.debug("output_tables_list: %s" % str(output_tables_list))
    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    # long format pieces with columns: geography, id, variable, control, result, diff
    melt_frames = []

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

        try:
            melt_df = _melt_control_summary(df)
        except Exception:
            # the melt is optional: most tables are not control summaries, so it's ok
            # if this doesn't work - but don't hide interrupts or lose the reason
            logger.debug("no summary melt for table '%s'" % table_name, exc_info=True)
            continue

        if melt_df is not None:
            melt_frames.append(melt_df)

    summary_melt_df = pd.concat(melt_frames) if melt_frames else pd.DataFrame()

    if len(summary_melt_df) > 0:
        file_name = "summary_melt.csv"
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # decide from the summary's own index, not from whichever table was written last
        write_index = summary_melt_df.index.name is not None
        summary_melt_df.to_csv(file_path, index=write_index)
Beispiel #20
0
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights of each seed zone (independently) against all the
    controls in the seed control table, starting from the household sample weights.

    Adds column balanced_weight to the seed weights table.

    If control_spec has no rows for the first (meta) geography, no balancing is run:
    balanced_weight is simply a copy of preliminary_balanced_weight (added only if the
    seed weights table does not already have a balanced_weight column).

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # without meta controls, balanced_weight is simply preliminary_balanced_weight
    meta_geography = settings['geographies'][0]
    has_meta_controls = (control_spec.geography == meta_geography).any()
    if not has_meta_controls:
        logger.warning(
            "no need for final_seed_balancing because no meta controls")
        existing_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' in existing_weights_df:
            return
        inject.add_column(seed_weight_table_name, 'balanced_weight',
                          existing_weights_df['preliminary_balanced_weight'])
        return

    # every control_spec row is used here, so (unlike initial_seed_balancing)
    # there is no filtering of control_spec by geography
    seed_controls_df = get_control_table(seed_geography)
    assert (seed_controls_df.columns == control_spec.target).all()

    # name of the total household control, as specified in settings
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    relaxation_factors = pd.DataFrame(index=seed_controls_df.columns.tolist())

    # balance each seed zone in turn, collecting the resulting weights
    balanced_weights_by_seed = []

    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("final_seed_balancing seed id %s" % seed_id)

        in_seed_zone = incidence_df[seed_geography] == seed_id
        seed_incidence_df = incidence_df[in_seed_zone]

        status, weights_df, controls_df = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "final_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights_by_seed.append(weights_df['final'])

        relaxation_factors[seed_id] = controls_df['relaxation_factor']

    # stitch the per-seed results into one series and attach it to the seed weights table
    inject.add_column(seed_weight_table_name, 'balanced_weight',
                      pd.concat(balanced_weights_by_seed))
Beispiel #21
0
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights of each seed zone (independently) using the controls
    for the seed geography and the geographies listed after it, starting from the
    household sample weights.

    Adds the seed weights table to the pipeline, named by weight_table_name(seed_geography),
    with one row per household of the incidence table (same index) and columns:

    - <seed_geography> : seed zone id of the household (e.g. PUMA)
    - preliminary_balanced_weight : float weight produced by the balancer
    - sample_weight : the household's sample weight from the incidence table
    - <household_id_col> : copy of the index as a named column
    - balanced_weight : copy of preliminary_balanced_weight, only if step arg 'final' is set

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # restrict control_spec to rows for the seed geography and the geographies after it
    geographies = settings['geographies']
    seed_position = geographies.index(seed_geography)
    at_or_below_seed = control_spec['geography'].isin(geographies[seed_position:])
    seed_control_spec = control_spec[at_or_below_seed]

    # name of the total household control, as specified in settings
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)
    min_expansion_factor = settings.get('min_expansion_factor', None)

    # balance each seed zone in turn
    balanced_by_seed = []
    sample_by_seed = []

    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        in_seed_zone = incidence_df[seed_geography] == seed_id
        seed_incidence_df = incidence_df[in_seed_zone]
        seed_sample_weights = seed_incidence_df['sample_weight']

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            min_expansion_factor=min_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_sample_weights)

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        seed_balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, seed_balanced_weights.sum()))

        balanced_by_seed.append(seed_balanced_weights)
        sample_by_seed.append(seed_sample_weights)

    # canonical weights table: one row per household, assembled from the per-seed pieces
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = pd.concat(balanced_by_seed)
    seed_weights_df['sample_weight'] = pd.concat(sample_by_seed)

    # expose the household id index as a named column as well
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # convenience for runs without meta controls: preliminary weights are already final
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = \
            seed_weights_df['preliminary_balanced_weight']

    repop = inject.get_step_arg('repop', default=False)
    inject.add_table(weight_table_name(seed_geography),
                     seed_weights_df,
                     replace=repop)
Beispiel #22
0
def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize all zones at a lowest geographic level.

    For every seed zone, and every lowest-geography zone the crosswalk places in it, the
    seed zone's household 'balanced_weight' values are scaled by the ratio of the low zone's
    to the seed zone's total household control, balanced against the low zone's controls,
    and then integerized.

    Creates (replacing any existing table) a weight table for the repop zones target
    geography with float 'balanced_weight' and 'integer_weight' columns plus the crosswalk's
    geography id columns, and a sparse version holding only the rows with integer_weight > 0.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec: pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)
    min_expansion_factor = settings.get('min_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # name this step in the log (the message used to claim initial_seed_balancing)
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # initial seed weights for this seed zone, indexed by hh id
        seed_weights_df = all_seed_weights_df[
            all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[
            seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id,
                                           low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't makes sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[
                low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df[
                'balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, controls_df = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                min_expansion_factor=min_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" %
                        (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" %
                                   trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info(
                "Total balanced weights for %s = %s" %
                (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s" %
                        (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results (household id index becomes a column)
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries:
    # look up the crosswalk row of each weight row's low zone, in weight row order,
    # so the two frames can be joined side by side
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography),
                     low_weights_df,
                     replace=True)
    # sparse table keeps only the households actually assigned to a zone
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0],
                     replace=True)
Beispiel #23
0
def expand_households():
    """
    Create a complete expanded synthetic household list with their assigned geographic zone ids.

    This is the skeleton synthetic household id list with no household or person attributes,
    one row per household, with geography columns and seed household table household_id.

    Rows of the sparse weight table of the lowest geography are repeated integer_weight times.
    If setting GROUP_BY_INCIDENCE_SIGNATURE is set, the household id column of that table
    holds group ids, and an actual household id is drawn at random for each expanded row from
    the households of its group, with probability proportional to sample_weight.

    Step args 'append' and 'replace' (mutually exclusive) combine the new rows with the
    existing expanded_household_ids table; 'replace' first drops the existing rows whose
    lowest geography id occurs in the new rows.

    If setting NO_INTEGERIZATION_EVER is set, an empty table is created instead.

    Creates pipeline table expanded_household_ids
    """

    if setting('NO_INTEGERIZATION_EVER', False):
        logger.warning("skipping expand_households: NO_INTEGERIZATION_EVER")
        inject.add_table('expanded_household_ids', pd.DataFrame())
        return

    geographies = setting('geographies')
    household_id_col = setting('household_id_col')

    low_geography = geographies[-1]

    # only one we really need is low_geography
    seed_geography = setting('seed_geography')
    geography_cols = geographies[geographies.index(seed_geography):]

    weights = get_weight_table(low_geography, sparse=True)
    weights = weights[geography_cols + [household_id_col, 'integer_weight']]

    # - expand weights table by integer_weight, so there is one row per desired hh
    weight_cols = weights.columns.values
    weights_np = np.repeat(weights.values,
                           weights.integer_weight.values,
                           axis=0)
    expanded_weights = pd.DataFrame(data=weights_np, columns=weight_cols)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):

        # the household_id_col is really the group_id
        expanded_weights.rename(columns={household_id_col: 'group_id'},
                                inplace=True)

        # the original incidence table with one row per hh, with index hh_id
        household_groups = pipeline.get_table('household_groups')
        household_groups = household_groups[[
            household_id_col, 'group_id', 'sample_weight'
        ]]

        # for each group, the hh_ids and their sample_weights (as relative probabilities)
        # { group_id: ([<group_hh_id_list>], [<group_hh_prob_list>]), ... }
        # keyed by group_id so group ids need not be the dense integers 0..n-1
        group_hh_probs = {}
        for group_id, df in household_groups.groupby('group_id'):
            hh_ids = list(df[household_id_col])
            probs = list(df.sample_weight / df.sample_weight.sum())
            group_hh_probs[group_id] = (hh_ids, probs)

        # FIXME - should sample without replacement?
        # now make a hh_id choice for each group_id in expanded_weights
        def chooser(group_id):
            hh_ids, hh_probs = group_hh_probs[group_id]
            return np.random.choice(hh_ids, p=hh_probs)

        # (the deprecated convert_dtype=True argument is omitted: it is the default)
        expanded_weights[household_id_col] = \
            expanded_weights.group_id.apply(chooser)

        # FIXME - omit in production?
        del expanded_weights['group_id']
        del expanded_weights['integer_weight']

    append = inject.get_step_arg('append', False)
    replace = inject.get_step_arg('replace', False)
    assert not (
        append and
        replace), "can't specify both append and replace for expand_households"

    if append or replace:
        t = inject.get_table('expanded_household_ids').to_frame()
        prev_hhs = len(t.index)
        added_hhs = len(expanded_weights.index)

        if replace:
            # drop the previously expanded households of every zone we are re-populating
            # FIXME - should really get from crosswalk table?
            low_ids_to_replace = expanded_weights[low_geography].unique()
            t = t[~t[low_geography].isin(low_ids_to_replace)]

        expanded_weights = pd.concat([t, expanded_weights], ignore_index=True)

        dropped_hhs = prev_hhs - len(t.index)
        final_hhs = len(expanded_weights.index)
        op = 'append' if append else 'replace'
        logger.info(
            "expand_households op: %s prev hh count %s dropped %s added %s final %s"
            % (op, prev_hhs, dropped_hhs, added_hhs, final_hhs))

    repop = inject.get_step_arg('repop', default=False)
    inject.add_table('expanded_household_ids', expanded_weights, replace=repop)
def integerize_final_seed_weights(settings, crosswalk, control_spec,
                                  incidence_table):
    """
    Integerize the balanced household weights of each seed (puma) zone, independently,
    against that zone's row of the seed control table.

    Adds integer_weight column to seed-level weight table

    Does nothing (beyond logging a warning) if setting NO_INTEGERIZATION_EVER is set.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    if setting('NO_INTEGERIZATION_EVER', False):
        logger.warning(
            "skipping integerize_final_seed_weights: NO_INTEGERIZATION_EVER")
        return

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    # the seed control table must have exactly the control_spec targets, in order
    control_cols = control_spec.target
    assert (seed_controls_df.columns == control_cols).all()

    # name of the total household control, as specified in settings
    total_hh_control_col = setting('total_hh_control')

    # integerize each seed zone in turn
    integer_weights_by_seed = []

    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # incidence rows and balanced weights of the households in this seed zone
        seed_incidence = incidence_df[incidence_df[seed_geography] == seed_id]

        weights_in_seed_zone = seed_weights_df[seed_geography] == seed_id
        balanced_seed_weights = \
            seed_weights_df.loc[weights_in_seed_zone, 'balanced_weight']

        integer_weights, status = do_integerizing(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=seed_incidence[control_cols],
            float_weights=balanced_seed_weights,
            total_hh_control_col=total_hh_control_col)

        integer_weights_by_seed.append(integer_weights)

    # stitch the per-seed results together and attach them to the seed weight table
    inject.add_column(weight_table_name(seed_geography), 'integer_weight',
                      pd.concat(integer_weights_by_seed))
Beispiel #25
0
# Script entry: handle command line arguments, configure logging, and log the
# settings in effect before the run list is looked up.

# Add (and handle) 'standard' activitysim arguments:
#     --config : specify path to config_dir
#     --output : specify path to output_dir
#     --data   : specify path to data_dir
#     --models : specify run_list name
#     --resume : resume_after
handle_standard_args()

# configure logging (via the tracing helper) before the first log call below
tracing.config_logger()

# NOTE(review): presumably the start timestamp for later elapsed-time reporting -
# confirm against print_elapsed_time
t0 = print_elapsed_time()

logger = logging.getLogger('populationsim')

# record the values of the settings this run will use, for later reference in the log
logger.info("GROUP_BY_INCIDENCE_SIGNATURE: %s" %
            setting('GROUP_BY_INCIDENCE_SIGNATURE'))
logger.info("INTEGERIZE_WITH_BACKSTOPPED_CONTROLS: %s" %
            setting('INTEGERIZE_WITH_BACKSTOPPED_CONTROLS'))
logger.info("SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS: %s" %
            setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS'))
logger.info("meta_control_data: %s" % setting('meta_control_data'))
logger.info("control_file_name: %s" % setting('control_file_name'))

# record what the lp and multi_integerizer modules report about cvxpy / simul integerizer use
logger.info("USE_CVXPY: %s" % lp.use_cvxpy())
logger.info("USE_SIMUL_INTEGERIZER: %s" %
            multi_integerizer.use_simul_integerizer())

# get the run list (name was possibly specified on the command line with the -m option)
# falls back to 'run_list' when no 'run_list_name' injectable is set
run_list_name = inject.get_injectable('run_list_name', 'run_list')

# run list from settings file is dict with list of 'steps' and optional 'resume_after'
Beispiel #26
0
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of the geography named by the 'geography' step arg,
    one parent zone at a time.

    The parent geography is the one listed immediately before 'geography' in the
    geographies setting. For every seed zone, and every parent zone the crosswalk places in
    it, balance_and_integerize is called with the seed zone's incidence rows and the parent
    zone's household weights ('balanced_weight' if setting SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS
    is true, which is the default, else 'integer_weight').

    Creates a weight table for the target geography from the concatenated results (with the
    ids of the geographies above it added as columns), a sparse version holding only the rows
    with integer_weight > 0, and - if the geography has an entry in the trace_geography
    setting - a trace table with the rows of that zone.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # split the geography list around the target geography
    geography_position = geographies.index(geography)
    parent_geography = geographies[geography_position - 1]
    sub_geographies = geographies[geography_position:]
    parent_geographies = geographies[:geography_position]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    zone_weights_list = []

    for seed_id in crosswalk_df[seed_geography].unique():

        # incidence and crosswalk rows of this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # a seed zone must lie within a single meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        for parent_id in seed_crosswalk_df[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            # weights of this parent zone, indexed by household id
            parent_zone_weights = weights_df[weights_df[parent_geography] == parent_id]
            parent_zone_weights = parent_zone_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            use_float_weights = setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True)
            weight_col = 'balanced_weight' if use_float_weights else 'integer_weight'
            initial_weights = parent_zone_weights[weight_col]

            # there must be exactly one weight per household of the seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            in_parent_zone = crosswalk_df[parent_geography] == parent_id
            higher_zone_ids = \
                crosswalk_df.loc[in_parent_zone, parent_geographies].max(axis=0)
            for higher_geography in higher_zone_ids.index:
                zone_weights_df[higher_geography] = higher_zone_ids[higher_geography]

            zone_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(zone_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    # sparse table keeps only the rows with a positive integer weight
    has_integer_weight = integer_weights_df['integer_weight'] > 0
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[has_integer_weight])

    # optionally keep the weights of one traced zone in a table of their own
    tracing_this_geography = \
        'trace_geography' in settings and geography in settings['trace_geography']
    if tracing_this_geography:
        trace_geography_id = settings.get('trace_geography')[geography]
        is_traced_zone = integer_weights_df[geography] == trace_geography_id
        inject.add_table('trace_%s' % weight_table_name(geography),
                         integer_weights_df[is_traced_zone])
Beispiel #27
0
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a specified geographic level
    in groups by parent zone.

    For instance, if the 'geography' step arg is 'TRACT' and the parent geography is 'SEED',
    then for each seed zone, we simul-balance the TRACTS it contains.

    Creates a weight table for the target geography
    with float 'balanced_weight' and 'integer_weight' columns.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
        Results are registered with inject.add_table: the full weight table,
        a sparse variant holding only rows with integer_weight > 0, and
        (when settings['trace_geography'] has an entry for the target
        geography) a 'trace_'-prefixed table restricted to that zone id.

    Raises
    ------
    RuntimeError
        If the 'geography' step arg is the first entry of
        settings['geographies'] and therefore has no parent geography.
    ValueError
        If the 'geography' step arg is not listed in settings['geographies'].
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # locate the target geography once. Position 0 has no parent, and
    # geographies[0 - 1] would silently wrap around to the last geography.
    geography_index = geographies.index(geography)
    if geography_index == 0:
        raise RuntimeError(
            "sub_balancing geography '%s' has no parent geography in %s" %
            (geography, geographies))
    parent_geography = geographies[geography_index - 1]

    sub_geographies = geographies[geography_index:]
    parent_geographies = geographies[:geography_index]

    total_hh_control_col = setting('total_hh_control')

    # these lookups do not vary by zone, so resolve them once up front
    household_id_col = settings.get('household_id_col')

    # using balanced_weight slows down simul and doesn't improve results
    # (float seeds means no zero-weight households to drop)
    if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
        parent_weight_col = 'balanced_weight'
    else:
        parent_weight_col = 'integer_weight'

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None, \
        "no weight table found for parent geography '%s'" % parent_geography

    integer_weights_list = []

    # the incidence table is siloed by seed geography, so we handle each seed zone in turn
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # a seed zone is expected to belong to exactly one meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # list of unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        parent_ids = seed_crosswalk_df[parent_geography].unique()
        # only want ones for which there are (non-zero) controls
        parent_ids = parent_controls_df.index.intersection(parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            # parent-level weights for this zone, indexed by household id
            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = \
                initial_weights.set_index(household_id_col)[parent_weight_col]

            # one parent weight is expected per household in the seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id, parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    # optionally publish the rows of a single traced zone as their own table
    trace_geographies = settings.get('trace_geography')
    if trace_geographies and geography in trace_geographies:
        trace_geography_id = trace_geographies[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance household weights against the seed-level control totals,
    one seed zone at a time.

    For every seed zone id found in the crosswalk, do_balancing is run on that
    zone's slice of the incidence table, starting from the 'sample_weight'
    column. The per-zone 'final' weights are concatenated and stored in a
    weight table for the seed geography holding the seed geography id,
    a 'preliminary_balanced_weight' column and the household id column.
    If the 'final' step arg is truthy, 'preliminary_balanced_weight' is also
    copied to 'balanced_weight'.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    None
        The weight table is registered with inject.add_table.

    Raises
    ------
    RuntimeError
        If balancing fails to converge for any seed zone.
    """

    xwalk = crosswalk.to_frame()
    incidence = incidence_table.to_frame()
    full_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls = get_control_table(seed_geography)

    # keep only the control_spec rows that belong to geographies below the seed
    all_geographies = settings['geographies']
    first_sub_position = all_geographies.index(seed_geography) + 1
    below_seed = all_geographies[first_sub_position:]
    seed_spec = full_spec[full_spec['geography'].isin(below_seed)]

    # balancer options taken from settings (both may be absent -> None)
    hh_total_col = settings.get('total_hh_control')
    expansion_cap = settings.get('max_expansion_factor', None)

    # balance each seed zone independently, collecting its final weights
    per_seed_weights = []
    for seed_id in xwalk[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        in_this_seed = incidence[seed_geography] == seed_id
        zone_incidence = incidence[in_this_seed]

        # the third returned value (controls) is not needed here
        status, zone_result, _ = do_balancing(
            control_spec=seed_spec,
            total_hh_control_col=hh_total_col,
            max_expansion_factor=expansion_cap,
            incidence_df=zone_incidence,
            control_totals=seed_controls.loc[seed_id],
            initial_weights=zone_incidence['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        zone_weights = zone_result['final']
        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, zone_weights.sum()))
        per_seed_weights.append(zone_weights)

    # stitch the per-zone results back together
    all_weights = pd.concat(per_seed_weights)

    # canonical weights table: seed id, preliminary weight, household id
    seed_weights_df = incidence[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = all_weights
    # expose the index (household ids) as a named column as well
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # convenience for runs without meta controls: preliminary weights are final
    is_final = inject.get_step_arg('final', default=False)
    if is_final:
        seed_weights_df['balanced_weight'] = \
            seed_weights_df['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)
Beispiel #29
0
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that specify the
    input file name and the name of the pipeline table, along with keys that allow the
    specification of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial populationsim run has
    completed, in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    Raises
    ------
    RuntimeError
        If a table entry has no 'filename', or the file does not exist in data_dir.
    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    # when run as a repop step, tables replace existing pipeline tables;
    # the step arg does not vary per table, so look it up once
    repop = inject.get_step_arg('repop', default=False)

    for table_info in table_list:

        tablename = table_info['tablename']

        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file
        data_filename = table_info.get('filename', None)
        if data_filename is None:
            # fail with a readable message rather than a TypeError from os.path.join
            raise RuntimeError(
                "input_pre_processor %s - no filename specified in %s" % (
                    tablename,
                    table_list_name,
                ))
        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError(
                "input_pre_processor %s - input file not found: %s" % (
                    tablename,
                    data_file_path,
                ))

        logger.info("Reading csv file %s" % data_file_path)
        # lines starting with '#' in the csv are treated as comments
        df = pd.read_csv(data_file_path, comment='#')

        logger.info("input file columns: %s" % df.columns.values)

        # drop columns (before renaming, so names refer to the input file's columns)
        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                assert not df.duplicated(index_col).any(), \
                    "duplicate values in index_col '%s' of table %s" % (index_col, tablename)
                df.set_index(index_col, inplace=True)
            else:
                # no such column: just give the existing index that name
                df.index.names = [index_col]

        # read expression file
        # expression_filename = table_info.get('expression_filename', None)
        # if expression_filename:
        #     assert False
        #     expression_file_path = os.path.join(configs_dir, expression_filename)
        #     if not os.path.exists(expression_file_path):
        #         raise RuntimeError("input_pre_processor %s - expression file not found: %s"
        #                            % (table, expression_file_path, ))
        #     spec = assign.read_assignment_spec(expression_file_path)
        #
        #     df_alias = table_info.get('df_alias', table)
        #
        #     locals_d = {}
        #
        #     results, trace_results, trace_assigned_locals \
        #         = assign.assign_variables(spec, df, locals_d, df_alias=df_alias)
        #     # for column in results.columns:
        #     #     orca.add_column(table, column, results[column])
        #
        #     df = pd.concat([df, results], axis=1)

        logger.info("adding table %s" % tablename)

        # add (or replace) pipeline table
        inject.add_table(tablename, df, replace=repop)
Beispiel #30
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    Pipeline tables are intermediate computational tables, not to be confused with the
    synthetic population tables written by the write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls and weights
    tables for the various geographies. For example, if one of your geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    A missing or empty 'tables' entry is treated as an empty list: with
    action 'skip' every checkpointed table is written, with action 'include'
    nothing is written.

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the 'action' entry is neither 'include' nor 'skip'.
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # normalize an absent/empty 'tables' entry so the membership tests below cannot fail
    tables = output_tables_settings.get('tables') or []

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # only write the index when it is named (i.e. carries meaningful ids)
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)