def add_age_categories(df, year):
    """Add five age-bracket population columns to the zone table.

    A seed matrix is built from the per-zone age shares in
    ``zone_forecast_inputs()`` multiplied by each zone's ``totpop``.  The
    regional age controls for ``year`` are rescaled so they sum to the
    total zone population, and ``simple_ipf`` is called with the seed and
    both sets of marginals (presumably iterative proportional fitting --
    confirm against the helper).  Each resulting row is then rounded so it
    sums to that zone's ``totpop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Zone table; must contain a ``totpop`` column.
    year : label
        Row label used to look up the regional controls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated in place, with ``AGE0004``,
        ``AGE0519``, ``AGE2044``, ``AGE4564`` and ``AGE65P`` columns.
    """
    zfi = zone_forecast_inputs()
    rc = regional_controls()

    share_cols = ["sh_age0004", "sh_age0519", "sh_age2044",
                  "sh_age4564", "sh_age65p"]
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on every pandas version.  The copy guarantees a writable array for
    # the in-place masking below.
    seed_matrix = zfi[share_cols].mul(df.totpop, axis='index').values.copy()

    row_marginals = df.totpop.values

    agecols = ["age0004", "age0519", "age2044", "age4564", "age65p"]
    col_marginals = rc[agecols].loc[year].values

    # rescale the regional age controls so they sum to the zone total
    target = df.totpop.sum()
    col_marginals = scale_by_target(
        pd.Series(col_marginals), target).round().astype('int')

    # replace zero seeds with a small positive value, then zero out whole
    # rows for zones that have no population at all
    seed_matrix[seed_matrix == 0] = .1
    seed_matrix[row_marginals == 0, :] = 0

    mat = simple_ipf(seed_matrix, col_marginals, row_marginals)
    agedf = pd.DataFrame(mat)
    agedf.columns = [col.upper() for col in agecols]
    agedf.index = zfi.index

    # round every zone's age brackets so they add up to that zone's totpop
    for ind, row in agedf.iterrows():
        zone_target = df.totpop.loc[ind]
        agedf.loc[ind] = round_series_match_target(
            row.round(), zone_target, 0)

    for col in agedf.columns:
        df[col] = agedf[col]

    return df
def add_employment(df, year):
    """Add an ``empres`` (employed residents) column to the zone table.

    The employed share of each zone is a weighted sum of the zone's
    household income-quartile shares plus the ``zonal_emp_sh_resid10``
    term from ``zone_forecast_inputs()``.  That share is applied to
    ``totpop``, scaled to the regional ``empres`` control for ``year``,
    rounded to match the control and finally capped at ``totpop``.

    Returns the same ``df`` object, mutated in place.
    """
    income_cols = ["hhincq1", "hhincq2", "hhincq3", "hhincq4"]
    quartile_counts = df[income_cols]
    zone_hh_totals = quartile_counts.sum(axis=1)
    shares = quartile_counts.divide(zone_hh_totals, axis="index")

    zfi = zone_forecast_inputs()

    # accumulate the terms in a fixed left-to-right order
    raw_share = 0.46381 * shares["hhincq1"]
    raw_share = raw_share + 0.49361 * shares["hhincq2"]
    raw_share = raw_share + 0.56938 * shares["hhincq3"]
    raw_share = raw_share + 0.29818 * shares["hhincq4"]
    raw_share = raw_share + zfi.zonal_emp_sh_resid10

    # No zone should plausibly have more than 70% of its people employed.
    # The cap also keeps employed residents below total population after
    # scaling; if the assertion at the bottom ever fires, lowering the .7
    # slightly is the fix.
    bounded_share = raw_share.fillna(0).clip(lower=.3, upper=.7)

    unscaled_empres = bounded_share * df.totpop

    rc = regional_controls()
    regional_target = rc.empres.loc[year]

    scaled_empres = scale_by_target(unscaled_empres, regional_target)
    df["empres"] = round_series_match_target(
        scaled_empres, regional_target, 0)

    # capping at totpop should make the assertion below pass on its own;
    # violations are now very rare
    capped = df[["empres", "totpop"]].min(axis=1)
    df["empres"] = capped

    # employed residents must never exceed total residents
    assert (df.empres <= df.totpop).all()

    return df
def _proportional_jobs_model( target_ratio, # ratio of jobs of this sector to households sector, # empsix sector groupby_col, # ratio will be matched at this level of geog hh_df, jobs_df, locations_series, target_jobs=None # pass this if you want to compute target jobs ): if target_jobs is None: # compute it if not passed target_jobs = hh_df[groupby_col].value_counts() * target_ratio target_jobs = target_jobs.astype('int') current_jobs = jobs_df[ jobs_df.empsix == sector][groupby_col].value_counts() need_more_jobs = target_jobs - current_jobs need_more_jobs = need_more_jobs[need_more_jobs > 0] need_more_jobs_total = int(need_more_jobs.sum()) available_jobs = \ jobs_df.query("empsix == '%s' and building_id == -1" % sector) print "Need more jobs total: %d" % need_more_jobs_total print "Available jobs: %d" % len(available_jobs) if len(available_jobs) == 0: # corner case return pd.Series() if len(available_jobs) >= need_more_jobs_total: # have enough jobs to assign, truncate available jobs available_jobs = available_jobs.head(need_more_jobs_total) else: # don't have enough jobs - random sample locations to partially # match the need (won't succed matching the entire need) need_more_jobs = round_series_match_target( need_more_jobs, len(available_jobs), 0) need_more_jobs_total = need_more_jobs.sum() assert need_more_jobs_total == len(available_jobs) if need_more_jobs_total <= 0: return pd.Series() print "Need more jobs\n", need_more_jobs choices = groupby_random_choice(locations_series, need_more_jobs) # choose random locations within jurises to match need_more_jobs totals return pd.Series(choices.index, available_jobs.index)
def add_population(df, year):
    """Add an ``hhpop`` (household population) column to the zone table.

    The regional target is the ``totpop`` control for ``year`` minus the
    summed ``gqpop`` column of ``df``.  Each zone's raw value is ``tothh``
    times the zone's ``meanhhsize`` from ``zone_forecast_inputs()``; the
    raw values are scaled to the target, rounded to match it, and zones
    left without a value get 0.

    Returns the same ``df`` object, mutated in place.
    """
    controls = regional_controls()
    hhpop_target = controls.totpop.loc[year] - df.gqpop.sum()

    inputs = zone_forecast_inputs()
    raw_hhpop = df.tothh * inputs.meanhhsize

    # third argument is forwarded to scale_by_target as-is; see that
    # helper for its meaning
    scaled_hhpop = scale_by_target(raw_hhpop, hhpop_target, .15)

    df["hhpop"] = round_series_match_target(scaled_hhpop, hhpop_target, 0)
    # zones missing from the rounded series end up NaN after alignment
    df["hhpop"] = df["hhpop"].fillna(0)

    return df
def add_age_categories(df, year):
    """Add five age-bracket population columns to the zone table.

    A seed matrix is built from the per-zone age shares in
    ``zone_forecast_inputs()`` multiplied by ``totpop`` of the zones in
    ``df`` that also appear in the forecast inputs.  The regional age
    controls for ``year`` are rescaled so they sum to the total zone
    population, and ``simple_ipf`` is called with the seed and both sets
    of marginals (presumably iterative proportional fitting -- confirm
    against the helper).  Each resulting row is then rounded so it sums to
    that zone's ``totpop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Zone table; must contain a ``totpop`` column.
    year : label
        Row label used to look up the regional controls.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated in place, with ``AGE0004``,
        ``AGE0519``, ``AGE2044``, ``AGE4564`` and ``AGE65P`` columns.
    """
    zfi = zone_forecast_inputs()
    rc = regional_controls()

    share_cols = ["sh_age0004", "sh_age0519", "sh_age2044",
                  "sh_age4564", "sh_age65p"]
    # restrict df to zones present in zfi to prevent errors on
    # non-matching data
    matching_totpop = df[df.index.isin(zfi.index)].totpop
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on every pandas version.  The copy guarantees a writable array for
    # the in-place masking below.
    seed_matrix = zfi[share_cols].mul(
        matching_totpop, axis='index').values.copy()

    row_marginals = df.totpop.values

    agecols = ["age0004", "age0519", "age2044", "age4564", "age65p"]
    col_marginals = rc[agecols].loc[year].values

    # rescale the regional age controls so they sum to the zone total
    target = df.totpop.sum()
    col_marginals = scale_by_target(
        pd.Series(col_marginals), target).round().astype('int')

    # replace zero seeds with a small positive value, then zero out whole
    # rows for zones that have no population at all
    seed_matrix[seed_matrix == 0] = .1
    seed_matrix[row_marginals == 0, :] = 0

    # Added by Derek: hand simple_ipf a DataFrame rather than a bare
    # ndarray, which caused problems inside simple_ipf
    seed_matrix = pd.DataFrame(seed_matrix)

    mat = simple_ipf(seed_matrix, col_marginals, row_marginals)
    agedf = pd.DataFrame(mat)
    agedf.columns = [col.upper() for col in agecols]
    agedf.index = zfi.index

    # round every zone's age brackets so they add up to that zone's totpop
    for ind, row in agedf.iterrows():
        zone_target = df.totpop.loc[ind]
        agedf.loc[ind] = round_series_match_target(
            row.round(), zone_target, 0)

    for col in agedf.columns:
        df[col] = agedf[col]

    return df
def _proportional_jobs_model( target_ratio, # ratio of jobs of this sector to households sector, # empsix sector groupby_col, # ratio will be matched at this level of geog hh_df, jobs_df, locations_series, target_jobs=None # pass this if you want to compute target jobs ): if target_jobs is None: # compute it if not passed target_jobs = hh_df[groupby_col].value_counts() * target_ratio target_jobs = target_jobs.astype('int') current_jobs = jobs_df[ jobs_df.empsix == sector][groupby_col].value_counts() need_more_jobs = target_jobs - current_jobs need_more_jobs = need_more_jobs[need_more_jobs > 0] need_more_jobs_total = int(need_more_jobs.sum()) available_jobs = \ jobs_df.query("empsix == '%s' and building_id == -1" % sector) print "Need more jobs total: %d" % need_more_jobs_total print "Available jobs: %d" % len(available_jobs) if len(available_jobs) == 0: # corner case return pd.Series() if len(available_jobs) >= need_more_jobs_total: # have enough jobs to assign, truncate available jobs available_jobs = available_jobs.head(need_more_jobs_total) else: # don't have enough jobs - random sample locations to partially # match the need (won't succed matching the entire need) need_more_jobs = round_series_match_target( need_more_jobs, len(available_jobs), 0) need_more_jobs_total = need_more_jobs.sum() assert need_more_jobs_total == len(available_jobs) if need_more_jobs_total <= 0: return pd.Series() print "Need more jobs\n", need_more_jobs excess = need_more_jobs.sub(locations_series.value_counts(), fill_value=0) print "Excess demand\n", excess[excess > 0] # there's an issue with groupby_random_choice where it can't choose from # a set of locations that don't exist - e.g. we have 2 jobs in a certain # city but not locations to put them in. 
we need to drop this demand drop = need_more_jobs.index.difference(locations_series.unique()) print "We don't have any locations for these locations:\n", drop need_more_jobs = need_more_jobs.drop(drop) # choose random locations within jurises to match need_more_jobs totals choices = groupby_random_choice(locations_series, need_more_jobs, replace=True) # these might not be the same length after dropping a few lines above available_jobs = available_jobs.head(len(choices)) return pd.Series(choices.index, available_jobs.index)