def summarize_geography(geography, weight_col, crosswalk_df, results_df, incidence_df,
                        hh_id_col=None):
    """
    Compare control totals with weighted results for every zone of a geography.

    For each zone id that appears both in the crosswalk and in the index of the
    control table for ``geography``, household incidence is multiplied by the
    household weights in ``weight_col`` and summed per control. The summary has
    three columns per control: ``<control>_control``, ``<control>_result`` and
    ``<control>_diff`` (result minus control).

    Parameters
    ----------
    geography : str
        name of the geography; also the name of the zone id column in
        crosswalk_df and results_df
    weight_col : str
        name of the weight column in results_df
    crosswalk_df : pandas.DataFrame
    results_df : pandas.DataFrame
        weights table with geography, household id and weight_col columns
    incidence_df : pandas.DataFrame
        incidence table, looked up by household id with .loc
    hh_id_col : str, optional
        name of the household id column in results_df; defaults to the
        'household_id_col' setting (previously hard-coded as 'hh_id')

    Returns
    -------
    summary_df : pandas.DataFrame
        one row per zone, indexed by '<geography>_<zone id>', with 'geography'
        and 'id' columns followed by the control/result/diff columns
    """

    if hh_id_col is None:
        # use the configured id column, as the other steps in this file do
        hh_id_col = setting('household_id_col')

    # controls_table for current geography level
    controls_df = get_control_table(geography)
    control_names = controls_df.columns.tolist()

    # only want zones from crosswalk that also have a row in the control table
    zone_ids = crosswalk_df[geography].unique()
    zone_ids = controls_df.index.intersection(zone_ids)

    results = []
    controls = []
    for zone_id in zone_ids:

        controls.append(controls_df.loc[zone_id].tolist())

        zone_weights = results_df[results_df[geography] == zone_id]
        incidence = incidence_df.loc[zone_weights[hh_id_col]]

        # plain list so the product is positional rather than index-aligned
        weights = zone_weights[weight_col].tolist()
        results.append([(incidence[c] * weights).sum() for c in control_names])

    controls_df = pd.DataFrame(
        data=np.asanyarray(controls),
        columns=['%s_control' % c for c in control_names],
        index=zone_ids)

    summary_df = pd.DataFrame(
        data=np.asanyarray(results),
        columns=['%s_result' % c for c in control_names],
        index=zone_ids)

    dif_df = pd.DataFrame(
        data=np.asanyarray(results) - np.asanyarray(controls),
        columns=['%s_diff' % c for c in control_names],
        index=zone_ids)

    summary_df = pd.concat([controls_df, summary_df, dif_df], axis=1)
    summary_cols = summary_df.columns.tolist()

    summary_df['geography'] = geography
    summary_df['id'] = summary_df.index
    summary_df.index = summary_df['geography'] + '_' + summary_df['id'].astype(str)

    # geography and id first, then the control/result/diff columns
    summary_df = summary_df[['geography', 'id'] + summary_cols]

    return summary_df
def meta_summary(incidence_df, control_spec, top_geography, top_id, sub_geographies):
    """
    Summarize weighted control totals for a single zone of the top geography.

    The result is indexed by control name. It holds the control value for the
    zone, one column per seed-level weight column that exists in the seed
    weight table, and 'balanced_weight' / 'integer_weight' columns for every
    sub geography that has a weight table.

    Parameters
    ----------
    incidence_df : pandas.DataFrame
    control_spec : pandas.DataFrame
        must have a 'target' column naming the controls
    top_geography : str
    top_id : zone id in top_geography
    sub_geographies : list of str

    Returns
    -------
    summary : pandas.DataFrame
    """

    incidence_df = incidence_df[incidence_df[top_geography] == top_id]
    target_names = control_spec.target.values

    # control values of this zone as a series indexed by control name
    zone_controls = get_control_table(top_geography)[target_names].loc[top_id]

    zone_incidence = incidence_df[target_names]

    summary = pd.DataFrame(index=target_names)
    summary.index.name = 'control_name'
    summary['control_value'] = zone_controls

    seed_weights_df = get_weight_table(setting('seed_geography'))

    for weight_name in ('preliminary_balanced_weight', 'balanced_weight', 'integer_weight'):
        if weight_name not in seed_weights_df:
            continue
        weighted = zone_incidence.multiply(seed_weights_df[weight_name], axis="index")
        summary['%s_%s' % (top_geography, weight_name)] = weighted.sum(axis=0)

    for sub_geography in sub_geographies:

        sub_weight_names = ['balanced_weight', 'integer_weight']

        geo_weights = get_weight_table(sub_geography)
        if geo_weights is None:
            continue

        geo_weights = geo_weights[geo_weights[top_geography] == top_id]

        # total weight per household across the zones of this sub geography
        per_household = \
            geo_weights[[setting('household_id_col')] + sub_weight_names]\
            .groupby(setting('household_id_col'))\
            .sum()

        for weight_name in sub_weight_names:
            weighted = zone_incidence.multiply(per_household[weight_name], axis="index")
            summary['%s_%s' % (sub_geography, weight_name)] = weighted.sum(axis=0)

    return summary
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Integerize the balanced seed weights, one seed zone at a time.

    Adds an 'integer_weight' column to the seed-level weight table.

    Parameters
    ----------
    settings : dict
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    targets = control_spec.target
    assert (seed_controls_df.columns == targets).all()

    total_hh_control_col = settings.get('total_hh_control')

    per_seed_weights = []
    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("integerize_final_seed_weights seed id %s" % seed_id)

        # rows of the incidence and weight tables that belong to this seed zone
        zone_incidence = incidence_df[incidence_df[seed_geography] == seed_id]
        zone_float_weights = seed_weights_df.loc[
            seed_weights_df[seed_geography] == seed_id, 'balanced_weight']

        # the status returned alongside the weights is not used here
        integer_weights, _ = do_integerizing(
            trace_label="%s_%s" % (seed_geography, seed_id),
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[seed_id],
            incidence_table=zone_incidence[targets],
            float_weights=zone_float_weights,
            total_hh_control_col=total_hh_control_col)

        per_seed_weights.append(integer_weights)

    # one series covering every seed zone
    all_integer_weights = pd.concat(per_seed_weights)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', all_integer_weights)
def final_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance household weights for each seed zone against the seed control table.

    If control_spec has no rows for the first entry of settings['geographies'],
    no balancing is run: 'preliminary_balanced_weight' is copied to
    'balanced_weight' (unless that column already exists) and the step returns.

    Otherwise do_balancing is run once per seed zone using all control_spec
    rows, starting from each household's 'sample_weight'. The concatenated
    'final' weights are added to the seed weight table as 'balanced_weight'.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_weight_table_name = weight_table_name(seed_geography)

    # if there are no meta controls, then balanced_weight is simply preliminary_balanced_weight
    geographies = settings['geographies']
    if not (control_spec.geography == geographies[0]).any():
        logger.warning("no need for final_seed_balancing because no meta controls")
        seed_weights_df = get_weight_table(seed_geography)
        if 'balanced_weight' not in seed_weights_df:
            final_seed_weights = seed_weights_df['preliminary_balanced_weight']
            inject.add_column(seed_weight_table_name, 'balanced_weight', final_seed_weights)
        return

    # we use all control_spec rows, so no need to filter on geography as for initial_seed_balancing
    seed_controls_df = get_control_table(seed_geography)
    assert (seed_controls_df.columns == control_spec.target).all()

    # determine master_control_index if specified in settings
    total_hh_control_col = settings.get('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each seed geography
    weight_list = []

    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # name this step in the log (was mislabelled as initial_seed_balancing)
        logger.info("final_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]

        # the third return value (per-control results) is not needed here
        status, weights_df, _ = do_balancing(
            control_spec=control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError("final_seed_balancing for seed_id %s did not converge" % seed_id)

        weight_list.append(weights_df['final'])

    # bulk concat all seed level results
    final_seed_weights = pd.concat(weight_list)

    inject.add_column(seed_weight_table_name, 'balanced_weight', final_seed_weights)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize the zones of one geography, grouped by parent zone.

    The geography to process is the 'geography' step arg; its parent is the
    entry just before it in settings['geographies']. For every seed zone, each
    parent zone that has a row in the parent control table is passed to
    balance_and_integerize together with the seed zone's incidence rows.

    The concatenated result is registered as the weight table of the
    geography, plus a sparse table holding only rows with integer_weight > 0
    and, when the geography is listed under the 'trace_geography' setting, a
    trace table for the configured zone id.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    geo_pos = geographies.index(geography)
    parent_geography = geographies[geo_pos - 1]
    sub_geographies = geographies[geo_pos:]
    parent_geographies = geographies[:geo_pos]

    total_hh_control_col = setting('total_hh_control')

    parent_controls_df = get_control_table(parent_geography)
    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    zone_weight_frames = []

    # incidence rows are grouped by seed zone, so work through the seed zones one by one
    for seed_id in crosswalk_df[seed_geography].unique():

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        # parent zones inside this seed zone, limited to those present in the control table
        candidate_parent_ids = seed_crosswalk_df[parent_geography].unique()
        parent_ids = parent_controls_df.index.intersection(candidate_parent_ids)

        for parent_id in parent_ids:

            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))

            parent_rows = weights_df[weights_df[parent_geography] == parent_id]
            parent_rows = parent_rows.set_index(settings.get('household_id_col'))

            # the setting picks float or integer parent weights as the starting point
            if setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True):
                start_col = 'balanced_weight'
            else:
                start_col = 'integer_weight'
            initial_weights = parent_rows[start_col]

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # tag rows with the ids of all higher level geographies to facilitate summaries
            in_parent = crosswalk_df[parent_geography] == parent_id
            ancestor_ids = crosswalk_df.loc[in_parent, parent_geographies].max(axis=0)
            for z in ancestor_ids.index:
                zone_weights_df[z] = ancestor_ids[z]

            zone_weight_frames.append(zone_weights_df)

    integer_weights_df = pd.concat(zone_weight_frames)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    has_weight = integer_weights_df['integer_weight'] > 0
    inject.add_table(weight_table_name(geography, sparse=True), integer_weights_df[has_weight])

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        traced_df = integer_weights_df[integer_weights_df[geography] == trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), traced_df)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance household weights independently for each seed zone.

    Only control_spec rows whose geography comes after the seed geography in
    settings['geographies'] are used. Balancing starts from each household's
    'sample_weight'.

    Adds the seed weight table to the pipeline under
    weight_table_name(seed_geography). It has one row per incidence table row
    and these columns:

    - <seed_geography> : seed zone id of the household
    - preliminary_balanced_weight : 'final' weight returned by do_balancing
    - <household_id_col setting> : copy of the table index
    - balanced_weight : copy of preliminary_balanced_weight, only when the
      'final' step arg is true

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # restrict control_spec to the geographies below the seed level
    geographies = settings['geographies']
    first_sub_pos = geographies.index(seed_geography) + 1
    below_seed = geographies[first_sub_pos:]
    seed_control_spec = control_spec[control_spec['geography'].isin(below_seed)]

    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    per_seed_weights = []
    for seed_id in crosswalk_df[seed_geography].unique():

        logger.info("initial_seed_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]

        status, weights_df, _ = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=seed_incidence_df,
            control_totals=seed_controls_df.loc[seed_id],
            initial_weights=seed_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError("initial_seed_balancing for seed_id %s did not converge" % seed_id)

        balanced_weights = weights_df['final']
        logger.info("Total balanced weights for seed %s = %s"
                    % (seed_id, balanced_weights.sum()))

        per_seed_weights.append(balanced_weights)

    # single series holding the weights of every seed zone
    weights = pd.concat(per_seed_weights)

    # weights table shares the incidence table index
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # expose the index as a regular column named by the household_id_col setting
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # convenience for runs without meta controls
    if inject.get_step_arg('final', default=False):
        seed_weights_df['balanced_weight'] = seed_weights_df['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)
def integerize_final_seed_weights(settings, crosswalk, control_spec, incidence_table):
    """
    Convert the float 'balanced_weight' of each seed zone to integer weights.

    do_integerizing is called once per seed zone with that zone's control
    totals, incidence rows (control columns only) and balanced weights.

    Adds integer_weight column to seed-level weight table

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)
    seed_weights_df = get_weight_table(seed_geography)

    # FIXME - I assume we want to integerize using meta controls too?
    control_cols = control_spec.target
    assert (seed_controls_df.columns == control_cols).all()

    total_hh_control_col = settings.get('total_hh_control')

    def integerize_zone(zone_id):
        # integer weights of the households in a single seed zone
        logger.info("integerize_final_seed_weights seed id %s" % zone_id)

        zone_incidence = incidence_df[incidence_df[seed_geography] == zone_id]

        weight_rows = seed_weights_df[seed_geography] == zone_id
        zone_balanced_weights = seed_weights_df.loc[weight_rows, 'balanced_weight']

        label = "%s_%s" % (seed_geography, zone_id)

        zone_integer_weights, status = do_integerizing(
            trace_label=label,
            control_spec=control_spec,
            control_totals=seed_controls_df.loc[zone_id],
            incidence_table=zone_incidence[control_cols],
            float_weights=zone_balanced_weights,
            total_hh_control_col=total_hh_control_col)

        return zone_integer_weights

    seed_ids = crosswalk_df[seed_geography].unique()
    weight_list = [integerize_zone(seed_id) for seed_id in seed_ids]

    # stack the per-zone results into one series
    integer_seed_weights = pd.concat(weight_list)

    inject.add_column(weight_table_name(seed_geography), 'integer_weight', integer_seed_weights)
def sub_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize all zones of the step's geography, parent by parent.

    The geography is read from the 'geography' step arg and its parent is the
    preceding entry of settings['geographies']. Every parent zone found in the
    crosswalk rows of a seed zone is handed to balance_and_integerize.

    Registers the weight table for the geography, a sparse variant with only
    the rows where integer_weight > 0 and, if the geography is listed in the
    'trace_geography' setting, a trace table for the configured zone id.

    Parameters
    ----------
    settings : dict
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table
    """

    # geography is an injected model step arg
    geography = inject.get_step_arg('geography')

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    level = geographies.index(geography)
    parent_geography = geographies[level - 1]
    sub_geographies = geographies[level:]
    parent_geographies = geographies[:level]

    total_hh_control_col = settings.get('total_hh_control')

    sub_controls_df = get_control_table(geography)

    weights_df = get_weight_table(parent_geography)
    assert weights_df is not None

    results = []

    for seed_id in crosswalk_df[seed_geography].unique():

        in_seed_incidence = incidence_df[seed_geography] == seed_id
        in_seed_crosswalk = crosswalk_df[seed_geography] == seed_id
        seed_incidence_df = incidence_df[in_seed_incidence]
        seed_crosswalk_df = crosswalk_df[in_seed_crosswalk]

        assert len(seed_crosswalk_df[meta_geography].unique()) == 1

        for parent_id in seed_crosswalk_df[parent_geography].unique():

            logger.info("balancing seed %s, %s %s" % (seed_id, parent_geography, parent_id))

            household_weights = weights_df[weights_df[parent_geography] == parent_id]\
                .set_index(settings.get('household_id_col'))

            # start from float or integer parent weights, depending on the setting
            use_float_weights = setting('SUB_BALANCE_WITH_FLOAT_SEED_WEIGHTS', True)
            initial_weights = household_weights[
                'balanced_weight' if use_float_weights else 'integer_weight']

            assert len(initial_weights.index) == len(seed_incidence_df.index)

            zone_weights_df = balance_and_integerize(
                incidence_df=seed_incidence_df,
                parent_weights=initial_weights,
                sub_controls_df=sub_controls_df,
                control_spec=control_spec,
                total_hh_control_col=total_hh_control_col,
                parent_geography=parent_geography,
                parent_id=parent_id,
                sub_geographies=sub_geographies,
                crosswalk_df=seed_crosswalk_df)

            # copy the ids of the higher level geographies onto the result rows
            higher_ids = crosswalk_df.loc[
                crosswalk_df[parent_geography] == parent_id, parent_geographies].max(axis=0)
            for z in higher_ids.index:
                zone_weights_df[z] = higher_ids[z]

            results.append(zone_weights_df)

    integer_weights_df = pd.concat(results)

    inject.add_table(weight_table_name(geography), integer_weights_df)

    nonzero_df = integer_weights_df[integer_weights_df['integer_weight'] > 0]
    inject.add_table(weight_table_name(geography, sparse=True), nonzero_df)

    if 'trace_geography' in settings and geography in settings['trace_geography']:
        trace_geography_id = settings.get('trace_geography')[geography]
        trace_df = integer_weights_df[integer_weights_df[geography] == trace_geography_id]
        inject.add_table('trace_%s' % weight_table_name(geography), trace_df)
def repop_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Balance and integerize household weights for each zone of the lowest geography.

    For every seed zone, and every low-geography zone that the crosswalk places
    in it, the seed 'balanced_weight' values are scaled by the ratio of the low
    zone's total household control to the seed zone's total household control.
    They are then balanced against the low-geography controls and integerized.

    Registers the low-geography weight table (columns: household id,
    low geography id, balanced_weight, integer_weight, plus the remaining
    crosswalk columns) and a sparse variant with only rows where
    integer_weight > 0.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a low-geography zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings['geographies']
    low_geography = geographies[-1]

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    all_seed_weights_df = get_weight_table(seed_geography)
    assert all_seed_weights_df is not None

    # only want control_spec rows for low_geography
    low_control_spec = control_spec[control_spec['geography'] == low_geography]
    low_controls_df = get_control_table(low_geography)

    household_id_col = setting('household_id_col')
    total_hh_control_col = setting('total_hh_control')

    max_expansion_factor = settings.get('max_expansion_factor', None)

    # run balancer for each low geography
    low_weight_list = []
    seed_ids = crosswalk_df[seed_geography].unique()
    for seed_id in seed_ids:

        # name this step in the log (was mislabelled as initial_seed_balancing)
        logger.info("repop_balancing seed id %s" % seed_id)

        seed_incidence_df = incidence_df[incidence_df[seed_geography] == seed_id]
        seed_crosswalk_df = crosswalk_df[crosswalk_df[seed_geography] == seed_id]

        # initial seed weights in series indexed by hh id
        seed_weights_df = all_seed_weights_df[all_seed_weights_df[seed_geography] == seed_id]
        seed_weights_df = seed_weights_df.set_index(household_id_col)

        # number of hh in seed zone (for scaling low zone weights)
        seed_zone_hh_count = seed_controls_df[total_hh_control_col].loc[seed_id]

        low_ids = seed_crosswalk_df[low_geography].unique()
        for low_id in low_ids:

            trace_label = "%s_%s_%s_%s" % (seed_geography, seed_id, low_geography, low_id)
            logger.info("balance and integerize %s" % trace_label)

            # weights table for this zone with household_id index and low_geography column
            zone_weights_df = pd.DataFrame(index=seed_weights_df.index)
            zone_weights_df[low_geography] = low_id

            # scale seed weights by relative hh counts
            # it doesn't make sense to repop balance with integer weights
            low_zone_hh_count = low_controls_df[total_hh_control_col].loc[low_id]
            scaling_factor = float(low_zone_hh_count) / seed_zone_hh_count
            initial_weights = seed_weights_df['balanced_weight'] * scaling_factor

            # - balance
            status, weights_df, _ = do_balancing(
                control_spec=low_control_spec,
                total_hh_control_col=total_hh_control_col,
                max_expansion_factor=max_expansion_factor,
                incidence_df=seed_incidence_df,
                control_totals=low_controls_df.loc[low_id],
                initial_weights=initial_weights)

            logger.info("repop_balancing balancing %s status: %s" % (trace_label, status))
            if not status['converged']:
                raise RuntimeError("repop_balancing for %s did not converge" % trace_label)

            zone_weights_df['balanced_weight'] = weights_df['final']

            # - integerize
            # use the low-geography spec so it matches the low-geography control
            # totals, exactly as in the balancing call above
            # NOTE(review): assumes do_integerizing expects control_spec and
            # control_totals to describe the same controls - confirm
            integer_weights, status = do_integerizing(
                trace_label=trace_label,
                control_spec=low_control_spec,
                control_totals=low_controls_df.loc[low_id],
                incidence_table=seed_incidence_df,
                float_weights=weights_df['final'],
                total_hh_control_col=total_hh_control_col)

            logger.info("repop_balancing integerizing status: %s" % status)

            zone_weights_df['integer_weight'] = integer_weights

            logger.info("Total balanced weights for %s = %s"
                        % (trace_label, zone_weights_df['balanced_weight'].sum()))
            logger.info("Total integerized weights for %s = %s"
                        % (trace_label, zone_weights_df['integer_weight'].sum()))

            low_weight_list.append(zone_weights_df)

    # concat all low geography zone level results
    low_weights_df = pd.concat(low_weight_list).reset_index()

    # add higher level geography id columns to facilitate summaries
    # (rows are looked up by low geography id, in the same order as low_weights_df)
    crosswalk_df = crosswalk_df.set_index(low_geography)\
        .loc[low_weights_df[low_geography]]\
        .reset_index(drop=True)
    low_weights_df = pd.concat([low_weights_df, crosswalk_df], axis=1)

    inject.add_table(weight_table_name(low_geography), low_weights_df)
    inject.add_table(weight_table_name(low_geography, sparse=True),
                     low_weights_df[low_weights_df['integer_weight'] > 0])
def meta_control_factoring(settings, control_spec, incidence_table):
    """
    Apply simple factoring to summed household fractional weights based on original
    meta control values relative to summed household fractional weights by meta zone.

    The resulting factored meta control weights will be new meta controls, to be
    appended to the original seed controls, for final balancing. The seed control
    table in the pipeline is replaced with the extended table.

    Does nothing (apart from logging a warning) when control_spec has no rows for
    the meta geography, i.e. the first entry of the 'geographies' setting.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------

    """

    # FIXME - if there is only one seed zone in the meta zone, just copy meta control values?

    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # - if there are no meta controls, then we don't have to do anything
    if not (control_spec.geography == meta_geography).any():
        # logger.warn is a deprecated alias; use logger.warning
        logger.warning("meta_control_factoring: no meta targets so nothing to do")
        return

    meta_controls_df = get_control_table(meta_geography)
    dump_table("meta_controls_df", meta_controls_df)

    # slice control_spec to select only the rows for meta level controls
    meta_controls_spec = control_spec[control_spec.geography == meta_geography]
    meta_control_targets = meta_controls_spec['target']

    logger.info("meta_control_factoring %s targets" % len(meta_control_targets))

    dump_table("meta_controls_spec", meta_controls_spec)
    dump_table("meta_control_targets", meta_control_targets)

    # seed level weights of all households (rows aligned with incidence_df rows)
    seed_weights_df = get_weight_table(seed_geography)
    assert len(incidence_df.index) == len(seed_weights_df.index)

    # expand person weights by incidence (incidence will simply be 1 for household targets)
    hh_level_weights = incidence_df[[seed_geography, meta_geography]].copy()
    for target in meta_control_targets:
        hh_level_weights[target] = \
            incidence_df[target] * seed_weights_df['preliminary_balanced_weight']

    dump_table("hh_level_weights", hh_level_weights)

    # weights of meta targets at seed level
    factored_seed_weights = \
        hh_level_weights.groupby([seed_geography, meta_geography], as_index=False).sum()
    factored_seed_weights.set_index(seed_geography, inplace=True)
    dump_table("factored_seed_weights", factored_seed_weights)

    # weights of meta targets summed from seed level to meta level
    factored_meta_weights = factored_seed_weights.groupby(meta_geography, as_index=True).sum()
    dump_table("factored_meta_weights", factored_meta_weights)

    # only the meta level controls from meta_controls table
    meta_controls_df = meta_controls_df[meta_control_targets]
    dump_table("meta_controls_df", meta_controls_df)

    # compute the scaling factors to be applied to the seed-level totals:
    meta_factors = pd.DataFrame(index=meta_controls_df.index)
    for target in meta_control_targets:
        meta_factors[target] = meta_controls_df[target] / factored_meta_weights[target]
    dump_table("meta_factors", meta_factors)

    # compute seed-level controls from meta-level controls
    seed_level_meta_controls = pd.DataFrame(index=factored_seed_weights.index)
    for target in meta_control_targets:
        # meta level scaling_factor for this meta_control
        # (each seed row picks up the factor of the meta zone it belongs to)
        scaling_factor = factored_seed_weights[meta_geography].map(meta_factors[target])

        # scale the seed_level_meta_controls by meta_level scaling_factor
        seed_level_meta_controls[target] = factored_seed_weights[target] * scaling_factor

        # FIXME - why round scaled factored seed_weights to int prior to final seed balancing?
        seed_level_meta_controls[target] = seed_level_meta_controls[target].round().astype(int)
    dump_table("seed_level_meta_controls", seed_level_meta_controls)

    # create final balancing controls
    # add newly created seed_level_meta_controls to the existing set of seed level controls
    seed_controls_df = get_control_table(seed_geography)

    assert len(seed_controls_df.index) == len(seed_level_meta_controls.index)
    seed_controls_df = pd.concat([seed_controls_df, seed_level_meta_controls], axis=1)

    # ensure columns are in right order for orca-extended table
    seed_controls_df = seed_controls_df[control_spec.target]
    assert (seed_controls_df.columns == control_spec.target).all()

    dump_table("seed_controls_df", seed_controls_df)

    pipeline.replace_table(control_table_name(seed_geography), seed_controls_df)
def initial_seed_balancing(settings, crosswalk, control_spec, incidence_table):
    """
    Run the balancer on every seed zone and store the preliminary weights.

    Uses the control_spec rows of the geographies listed after the seed
    geography in settings['geographies'], with 'sample_weight' as the starting
    weight of each household. The resulting seed weight table (seed zone id,
    'preliminary_balanced_weight', household id column and, when the 'final'
    step arg is true, 'balanced_weight') is added to the pipeline.

    Parameters
    ----------
    settings : dict
    crosswalk : pipeline table
    control_spec : pipeline table
    incidence_table : pipeline table

    Raises
    ------
    RuntimeError
        if balancing does not converge for a seed zone
    """

    crosswalk_df = crosswalk.to_frame()
    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    seed_geography = settings.get('seed_geography')
    seed_controls_df = get_control_table(seed_geography)

    # keep only the control_spec rows of geographies below the seed level
    geographies = settings['geographies']
    sub_geographies = geographies[geographies.index(seed_geography) + 1:]
    is_sub_control = control_spec['geography'].isin(sub_geographies)
    seed_control_spec = control_spec[is_sub_control]

    total_hh_control_col = settings.get('total_hh_control')
    max_expansion_factor = settings.get('max_expansion_factor', None)

    def balance_zone(zone_id):
        # balanced weights of the households in a single seed zone
        logger.info("initial_seed_balancing seed id %s" % zone_id)

        zone_incidence_df = incidence_df[incidence_df[seed_geography] == zone_id]

        status, weights_df, controls_df = do_balancing(
            control_spec=seed_control_spec,
            total_hh_control_col=total_hh_control_col,
            max_expansion_factor=max_expansion_factor,
            incidence_df=zone_incidence_df,
            control_totals=seed_controls_df.loc[zone_id],
            initial_weights=zone_incidence_df['sample_weight'])

        logger.info("seed_balancer status: %s" % status)
        if not status['converged']:
            raise RuntimeError("initial_seed_balancing for seed_id %s did not converge" % zone_id)

        zone_weights = weights_df['final']
        logger.info("Total balanced weights for seed %s = %s" % (zone_id, zone_weights.sum()))
        return zone_weights

    seed_ids = crosswalk_df[seed_geography].unique()
    weight_list = [balance_zone(seed_id) for seed_id in seed_ids]

    # stack the per-zone series
    weights = pd.concat(weight_list)

    # canonical weights table, indexed like the incidence table
    seed_weights_df = incidence_df[[seed_geography]].copy()
    seed_weights_df['preliminary_balanced_weight'] = weights

    # household id is also kept as a named column
    seed_weights_df[setting('household_id_col')] = seed_weights_df.index

    # shortcut when there are no meta controls
    is_final = inject.get_step_arg('final', default=False)
    if is_final:
        seed_weights_df['balanced_weight'] = seed_weights_df['preliminary_balanced_weight']

    inject.add_table(weight_table_name(seed_geography), seed_weights_df)