def input_pre_processor():
    """
    Load the csv input files named in the configured table list into the pipeline.

    The table list (by default 'input_table_list' in settings.yaml, overridable
    via the 'table_list' model step argument) is a list of dicts, each describing
    one input file and how to pre-process it:

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    An alternate table list lets a repop run (run after an initial run has
    completed) supply replacement tables, e.g. new lowest-geography controls
    replacing the previous low controls dataframe.  See input_table_list in
    settings.yaml in the example folder for a working example.
    """
    # the step argument may name an alternate table list (used by repop)
    list_name = inject.get_step_arg('table_list', default='input_table_list')

    tables = config.setting(list_name)
    assert tables is not None, "no table list '%s' found in settings." % list_name

    logger.info('Using table list: %s' % tables)

    for info in tables:
        name = info.get('tablename')
        frame = input.read_from_table_info(info)

        logger.info('registering table %s' % name)

        # replace (rather than add) the pipeline table when running repop
        replacing = inject.get_step_arg('repop', default=False)
        inject.add_table(name, frame, replace=replacing)
def step_add_col():
    """
    Append a deterministic test column to an existing pipeline table.

    Reads the target table and new column names from the 'table_name' and
    'column_name' step arguments, then writes back a column whose values are
    the table index offset by 1000 * (number of existing columns).
    """
    target = inject.get_step_arg('table_name')
    assert target is not None

    new_col = inject.get_step_arg('column_name')
    assert new_col is not None

    df = pipeline.get_table(target)

    # refuse to clobber an existing column
    assert new_col not in df.columns

    offset = 1000 * len(df.columns)
    df[new_col] = df.index + offset

    pipeline.replace_table(target, df)
def step2():
    """Register a small fixed test table under the name given by the 'table_name' step arg."""
    name = inject.get_step_arg('table_name')
    assert name is not None

    data = {'column1': [10, 20, 30]}
    inject.add_table(name, pd.DataFrame(data))
def step_forget_tab():
    """
    Drop the pipeline table named by the 'table_name' step argument.

    Raises via pipeline.get_table if the table does not exist, so a bad
    table_name fails loudly rather than silently dropping nothing.
    """
    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    # fetch (and discard) the table so a missing table fails fast;
    # the original bound the result to an unused local
    pipeline.get_table(table_name)

    pipeline.drop_table(table_name)
def out_table(table_name, df):
    """
    Persist a summary dataframe, either as a csv file or as a pipeline table.

    When the module-level AS_CSV flag is truthy, writes
    'summary_<table_name>.csv' into the injected output_dir; otherwise
    registers the dataframe as pipeline table 'summary_<table_name>'
    (replacing it when running repop).

    Parameters
    ----------
    table_name : str
        base name; prefixed with 'summary_'
    df : pandas.DataFrame
        summary data to save
    """
    summary_name = "summary_%s" % table_name

    if AS_CSV:
        csv_name = "%s.csv" % summary_name
        out_dir = inject.get_injectable('output_dir')
        csv_path = os.path.join(out_dir, csv_name)

        logger.info("writing output file %s" % csv_path)

        # only write the index when it carries a name (i.e. is meaningful)
        df.to_csv(csv_path, index=df.index.name is not None)
    else:
        logger.info("saving summary table %s" % summary_name)
        replacing = inject.get_step_arg('repop', default=False)
        inject.add_table(summary_name, df, replace=replacing)
def annotate_table(configs_dir):
    """
    Add expression-computed columns to a pipeline table.

    The 'model_name' step argument names a yaml model settings file in
    configs_dir; its 'DF' key names the pipeline table to annotate.  Columns
    produced by expressions.compute_columns are assigned in place and the
    updated table is written back to the pipeline.

    Parameters
    ----------
    configs_dir : str
        directory containing the '<model_name>.yaml' settings file
    """
    # model settings name should have been provided as a step argument
    model_name = inject.get_step_arg('model_name')
    model_spec = config.read_model_settings(configs_dir, '%s.yaml' % model_name)

    target_name = model_spec['DF']
    target_df = inject.get_table(target_name).to_frame()

    computed = expressions.compute_columns(
        target_df,
        model_settings=model_spec,
        configs_dir=configs_dir,
        trace_label=None)

    assign_in_place(target_df, computed)

    pipeline.replace_table(target_name, target_df)
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that specify the
    input file name and the name of the pipeline table, along with keys allowing the
    specification of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument
    'table_list'. (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial populationsim run has
    completed, in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    """
    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list', default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']
        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file; fail with a clear message rather than a TypeError
        # from os.path.join when the 'filename' key is missing
        data_filename = table_info.get('filename', None)
        assert data_filename is not None, \
            "input_pre_processor %s - no filename specified" % tablename

        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError("input_pre_processor %s - input file not found: %s"
                               % (tablename, data_file_path, ))

        logger.info("Reading csv file %s" % data_file_path)
        df = pd.read_csv(data_file_path, comment='#')

        logger.info("input file columns: %s" % df.columns.values)

        # drop unwanted columns (names are pre-rename, i.e. as they appear in the csv)
        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index (index column values must be unique)
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                assert not df.duplicated(index_col).any()
                df.set_index(index_col, inplace=True)
            else:
                # name the existing (implicit) index
                df.index.names = [index_col]

        # NOTE: a commented-out stub for an 'expression_filename' pre-processing
        # step was removed here; restore from version history if ever needed.

        logger.info("adding table %s" % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights for each of the seed geographies (independently)
    using the seed level controls and the aggregated sub-zone controls totals.

    Create the seed_weights table with one row per household and columns containing
    household_id, seed geography (e.g. PUMA), and float preliminary_balanced_weights

    Adds seed_weights table to pipeline named <seed_geography>_weights (e.g. PUMA_weights):

    +--------+------+-----------------------------+-------+
    | index  | PUMA | preliminary_balanced_weight | hh_id |
    | hh_id  |      |                             |       |
    +========+======+=============================+=======+
    | 0      | 600  | 0.313555                    | 0     |
    | 1      | 601  | 0.627110                    | 1     |
    | 2      | 602  | 0.313555                    | 2     |
    | ...    |      |                             |       |
    +--------+------+-----------------------------+-------+

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for seed geography and below
    geographies = settings['geographies']
    seed_geographies = geographies[geographies.index(seed_geography):]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        seed_geographies)]

    # determine master_control_index if specified in settings
    total_hh_control_col = setting('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)
    min_expansion_factor = settings.get('min_expansion_factor', None)

    # run balancer for each seed geography, collecting per-seed weight series
    weight_list = []
    sample_weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        # slice incidence table to just the households in this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            min_expansion_factor=min_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)
        sample_weight_list.append(seed_incidence_df['sample_weight'])

    # bulk concat all seed level results
    # (series are hh_id-indexed, so assignment below aligns by index)
    weights = pd.concat(weight_list)
    sample_weights = pd.concat(sample_weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights
    seed_weights_df['sample_weight'] = sample_weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    # add (or, for repop, replace) the pipeline weights table
    repop = inject.get_step_arg('repop', default=False)
    inject.add_table(weight_table_name(seed_geography),
                     seed_weights_df,
                     replace=repop)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at a specified geographic level
    in groups by parent zone.

    For instance, if the 'geography' step arg is 'TRACT' and the parent geography
    is 'SEED', then for each seed zone, we simul-balance the TRACTS it contains.

    Creates a weight table for the target geography
    with float 'balanced_weight' and 'integer_weight' columns.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    """
    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    # derive the geography hierarchy slices relative to the target geography
    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    parent_geography = geographies[geographies.index(geography) - 1]
    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = setting('total_hh_control')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    # parent-level weights must already have been balanced by an earlier step
    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    # the incidence table is siloed by seed geography, so we handle each seed zone in turn
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # a seed zone is expected to lie within a single meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # list of unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        parent_ids = seed_crosswalk_df[parent_geography].unique()

        # only want ones for which there are (non-zero) controls
        parent_ids = parent_controls_df.index.intersection(parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            # one parent weight per household in this seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id,
                                 parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    # sparse variant omits households with zero integer weight
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    # optionally register a trace table for a single traced zone
    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
def expand_households():
    """
    Create a complete expanded synthetic household list with their assigned
    geographic zone ids.

    This is the skeleton synthetic household id list with no household or person
    attributes, one row per household, with geography columns and seed household
    table household_id.

    Creates pipeline table expanded_household_ids
    """
    if setting('NO_INTEGERIZATION_EVER', False):
        logger.warning("skipping expand_households: NO_INTEGERIZATION_EVER")
        inject.add_table('expanded_household_ids', pd.DataFrame())
        return

    geographies = setting('geographies')
    household_id_col = setting('household_id_col')

    low_geography = geographies[-1]

    # only one we really need is low_geography
    seed_geography = setting('seed_geography')
    geography_cols = geographies[geographies.index(seed_geography):]

    weights = get_weight_table(low_geography, sparse=True)
    weights = weights[geography_cols + [household_id_col, 'integer_weight']]

    # - expand weights table by integer_weight, so there is one row per desired hh
    weight_cols = weights.columns.values
    weights_np = np.repeat(weights.values,
                           weights.integer_weight.values,
                           axis=0)
    expanded_weights = pd.DataFrame(data=weights_np, columns=weight_cols)

    if setting('GROUP_BY_INCIDENCE_SIGNATURE'):

        # the household_id_col is really the group_id
        expanded_weights.rename(columns={household_id_col: 'group_id'},
                                inplace=True)

        # the original incidence table with one row per hh, with index hh_id
        household_groups = pipeline.get_table('household_groups')
        household_groups = household_groups[[
            household_id_col, 'group_id', 'sample_weight'
        ]]

        # for each group, lists of hh_ids and their sample_weights (as relative probabilities)
        # [ [ [<group_0_hh_id_list>], [<group_0_hh_prob_list>] ],
        #   [ [<group_1_hh_id_list>], [<group_1_hh_prob_list>] ], ... ]
        # NOTE(review): indexing group_hh_probs by group_id assumes group ids are
        # contiguous ints 0..n_groups-1 — confirm against household_groups builder
        HH_IDS = 0
        HH_PROBS = 1
        grouper = household_groups.groupby('group_id')
        group_hh_probs = [0] * len(grouper)
        for group_id, df in grouper:
            hh_ids = list(df[household_id_col])
            probs = list(df.sample_weight / df.sample_weight.sum())
            group_hh_probs[group_id] = [hh_ids, probs]

        # FIXME - should sample without replacement?
        # now make a hh_id choice for each group_id in expanded_weights
        def chooser(group_id):
            # pick one representative hh from the group, weighted by sample_weight
            hh_ids = group_hh_probs[group_id][HH_IDS]
            hh_probs = group_hh_probs[group_id][HH_PROBS]
            return np.random.choice(hh_ids, p=hh_probs)
        expanded_weights[household_id_col] = \
            expanded_weights.group_id.apply(chooser, convert_dtype=True,)

        # FIXME - omit in production?
        del expanded_weights['group_id']
        del expanded_weights['integer_weight']

    # step args: append adds to, replace overwrites matching low-geography zones of,
    # the existing expanded_household_ids table
    append = inject.get_step_arg('append', False)
    replace = inject.get_step_arg('replace', False)
    assert not (
        append and
        replace), "can't specify both append and replace for expand_households"

    if append or replace:
        t = inject.get_table('expanded_household_ids').to_frame()
        prev_hhs = len(t.index)
        added_hhs = len(expanded_weights.index)

        if replace:
            # FIXME - should really get from crosswalk table?
            low_ids_to_replace = expanded_weights[low_geography].unique()
            t = t[~t[low_geography].isin(low_ids_to_replace)]

        expanded_weights = pd.concat([t, expanded_weights],
                                     ignore_index=True)

        dropped_hhs = prev_hhs - len(t.index)
        final_hhs = len(expanded_weights.index)
        op = 'append' if append else 'replace'
        logger.info(
            "expand_households op: %s prev hh count %s dropped %s added %s final %s"
            % (op, prev_hhs, dropped_hhs, added_hhs, final_hhs))

    # add (or, for repop, replace) the pipeline table
    repop = inject.get_step_arg('repop', default=False)
    inject.add_table('expanded_household_ids',
                     expanded_weights,
                     replace=repop)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Simul-balance and integerize all zones at the geographic level given by the
    'geography' step arg, in groups by parent zone.

    Creates weight tables for the target geography (full and sparse variants)
    with float 'balanced_weight' and 'integer_weight' columns.

    NOTE(review): unlike the other sub_balancing variant in this file, this one
    does not restrict parent_ids to parent zones with controls — confirm intended.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """
    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    # derive the geography hierarchy slices relative to the target geography
    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]
    parent_geography = geographies[geographies.index(geography) - 1]
    sub_geographies = geographies[geographies.index(geography):]
    parent_geographies = geographies[:geographies.index(geography)]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    # parent-level weights must already have been balanced by an earlier step
    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    integer_weights_list = []

    # the incidence table is siloed by seed geography, so handle each seed zone in turn
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # slice incidence and crosswalk tables for this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] ==
                                         seed_id]

        # a seed zone is expected to lie within a single meta zone
        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # unique parent zone ids in this seed zone
        # (there will be just one if parent geo is seed)
        parent_ids = seed_crosswalk_df[parent_geography].unique()

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" %
                        (seed_id, parent_geography, parent_id))

            initial_weights = weights_df[weights_df[parent_geography] ==
                                         parent_id]
            initial_weights = initial_weights.set_index(
                settings.get('household_id_col'))

            # using balanced_weight slows down simul and doesn't improve results
            # (float seeds means no zero-weight households to drop)
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                initial_weights = initial_weights['balanced_weight']
            else:
                initial_weights = initial_weights['integer_weight']

            # one parent weight per household in this seed zone
            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # add higher level geography id columns to facilitate summaries
            parent_geography_ids = \
                crosswalk_df.loc[crosswalk_df[parent_geography] == parent_id,
                                 parent_geographies]\
                .max(axis=0)
            for z in parent_geography_ids.index:
                zone_weights_df[z] = parent_geography_ids[z]

            integer_weights_list.append(zone_weights_df)

    integer_weights_df = pd.concat(integer_weights_list)

    inject.add_table(weight_table_name(geography), integer_weights_df)
    # sparse variant omits households with zero integer weight
    inject.add_table(
        weight_table_name(geography, sparse=True),
        integer_weights_df[integer_weights_df['integer_weight'] > 0])

    # optionally register a trace table for a single traced zone
    if 'trace_geography' in settings and geography in settings[
            'trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        df = integer_weights_df[integer_weights_df[geography] ==
                                trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), df)
def input_pre_processor():
    """
    Read input csv files named in the configured table list and register each as
    a pipeline table, applying optional drop_columns / column_map / index_col
    pre-processing from each table_info dict.

    By default reads the table list from 'input_table_list' in settings.yaml,
    unless an alternate name is given via the 'table_list' step argument.
    """
    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list', default='input_table_list')
    table_list = setting(table_list_name)
    assert table_list is not None, "table list '%s' not in settings." % table_list_name

    data_dir = data_dir_from_settings()

    for table_info in table_list:

        tablename = table_info['tablename']
        logger.info("input_pre_processor processing %s" % tablename)

        # read the csv file
        data_filename = table_info.get('filename', None)
        data_file_path = os.path.join(data_dir, data_filename)
        if not os.path.exists(data_file_path):
            raise RuntimeError("input_pre_processor %s - input file not found: %s"
                               % (tablename, data_file_path, ))

        logger.info("Reading csv file %s" % data_file_path)
        df = pd.read_csv(data_file_path, comment='#')

        # was `print df.columns` - a python 2 print statement that is a syntax
        # error under python 3; log instead, consistent with the sibling
        # input_pre_processor implementation
        logger.info("input file columns: %s" % df.columns.values)

        # drop unwanted columns (names are pre-rename, as they appear in the csv)
        drop_columns = table_info.get('drop_columns', None)
        if drop_columns:
            for c in drop_columns:
                logger.info("dropping column '%s'" % c)
                del df[c]

        # rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            df.rename(columns=column_map, inplace=True)

        # set index (index column values must be unique)
        index_col = table_info.get('index_col', None)
        if index_col is not None:
            if index_col in df.columns:
                assert not df.duplicated(index_col).any()
                df.set_index(index_col, inplace=True)
            else:
                # name the existing (implicit) index
                df.index.names = [index_col]

        # NOTE: a commented-out stub for an 'expression_filename' pre-processing
        # step was removed here; restore from version history if ever needed.

        logger.info("adding table %s" % tablename)

        inject.add_table(tablename, df)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance the household weights for each seed geography independently,
    creating a <seed_geography>_weights pipeline table with one row per
    household and a float preliminary_balanced_weight column.

    NOTE(review): unlike the other initial_seed_balancing variant in this file,
    this one filters control_spec to sub-geographies only (below seed) and has
    no min_expansion_factor support — confirm intended.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """
    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # only want control_spec rows for sub_geographies
    geographies = settings['geographies']
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    seed_control_spec = control_spec[control_spec['geography'].isin(
        sub_geographies)]

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography, collecting per-seed weight series
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        # slice incidence table to just the households in this seed zone
        seed_incidence_df = incidence_df[incidence_df[seed_geography] ==
                                         seed_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError(
                "initial_seed_balancing for seed_id %s did not converge" %
                seed_id)

        balanced_weights = weights_df['final']

        logger.info("Total balanced weights for seed %s = %s" %
                    (seed_id, balanced_weights.sum()))

        weight_list.append(balanced_weights)

    # bulk concat all seed level results
    # (series are hh_id-indexed, so assignment below aligns by index)
    weights = pd.concat(weight_list)

    # build canonical weights table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # copy household_id_col index to named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # this is just a convenience if there are no meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df[
            'preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)