def population_generator():
    """Generate synthetic populations for every PUMA without existing output.

    A PUMA is skipped when ``output_dir`` already contains a file named
    ``state_<STATE>_puma_<puma>_households.csv``. For each remaining PUMA the
    household and person PUMS CSVs are loaded and cleaned, the Bayes nets are
    created, tract data is downloaded, a synthetic population is generated and
    the mean absolute percent error is logged. ``_combine_and_synthesize()``
    is called once after all PUMAs have been processed.

    Relies on the module-level names ``puma_df_clean``, ``STATE``,
    ``AOI_NAME``, ``output_dir``, ``census_api_key``, ``household_fields`` and
    ``persons_fields``.
    """
    pumas_to_go = set(puma_df_clean.values.tolist())

    # Scan the output directory once; nothing in this loop creates files, so
    # re-listing it for every PUMA only repeats the same work.
    existing_outputs = set(os.listdir(output_dir))
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        if gen_puma in existing_outputs:
            print(puma)
            # discard() instead of remove(): if the same PUMA id occurs more
            # than once, the second removal must not raise KeyError.
            pumas_to_go.discard(puma)

    puma_tract_mappings = 'input/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    # No-op segmenters (always return None); identical for every PUMA.
    person_segmenter = lambda x: None
    household_segmenter = lambda x: None

    for puma_id in pumas_to_go:
        # NOTE(review): the raw CSVs are re-read for every PUMA. Loading them
        # once before the loop would be cheaper, but is only safe if clean()
        # does not mutate the raw data -- confirm before hoisting.
        households_data = PumsData.from_csv(
            'input/{}_household_pums_data.csv'.format(AOI_NAME)
        ).clean(
            household_fields, preprocessor,
            state=str(int(STATE)), puma=str(int(puma_id))
        )
        persons_data = PumsData.from_csv(
            'input/{}_person_pums_data.csv'.format(AOI_NAME)
        ).clean(
            persons_fields, preprocessor,
            state=str(int(STATE)), puma=str(int(puma_id))
        )
        print("loaded")

        household_model, person_model = create_bayes_net(
            STATE, puma_id, output_dir,
            households_data, persons_data, configuration,
            person_segmenter, household_segmenter
        )

        marginals, allocator = download_tract_data(
            STATE, puma_id, output_dir, census_api_key, puma_tract_mappings,
            households_data, persons_data
        )
        print('Allocated {}'.format(puma_id))

        population = generate_synthetic_people_and_households(
            STATE, puma_id, output_dir, allocator,
            person_model, household_model
        )
        print('Generated {}'.format(puma_id))

        accuracy = Accuracy.from_doppelganger(
            cleaned_data_persons=persons_data,
            cleaned_data_households=households_data,
            marginal_data=marginals,
            population=population
        )
        logging.info('Absolute Percent Error for state {}, and puma {}: {}'.format(
            STATE, puma_id, accuracy.absolute_pct_error().mean()))

    _combine_and_synthesize()
def _generate_for_puma_data(households_raw_data, persons_raw_data,
                            preprocessor, puma_tract_mappings, puma_id):
    """Clean, model, allocate, generate and score the data for one PUMA.

    Args:
        households_raw_data: Raw household data exposing ``clean(...)``.
        persons_raw_data: Raw person data exposing ``clean(...)``.
        preprocessor: Preprocessor handed to both ``clean`` calls.
        puma_tract_mappings: Passed through to ``download_tract_data``.
        puma_id: PUMA identifier; must be convertible with ``int()``.

    Returns:
        Always ``True``.

    Relies on the module-level names ``STATE``, ``OUTPUT_DIR``,
    ``configuration``, ``census_api_key``, ``household_fields`` and
    ``persons_fields``.
    """
    # Both clean() calls receive the same state/PUMA keyword arguments.
    area_kwargs = {'state': str(int(STATE)), 'puma': str(int(puma_id))}
    households_data = households_raw_data.clean(
        household_fields, preprocessor, **area_kwargs)
    persons_data = persons_raw_data.clean(
        persons_fields, preprocessor, **area_kwargs)

    # No-op segmenters: each always returns None.
    person_segmenter = lambda x: None
    household_segmenter = lambda x: None

    loaded_message = "{} input data loaded. Starting allocation/generation.".format(puma_id)
    print(loaded_message)

    models = create_bayes_net(
        STATE, puma_id, OUTPUT_DIR,
        households_data, persons_data, configuration,
        person_segmenter, household_segmenter)
    household_model, person_model = models

    tract_result = download_tract_data(
        STATE, puma_id, OUTPUT_DIR, census_api_key, puma_tract_mappings,
        households_data, persons_data)
    marginals, allocator = tract_result
    print('Allocated {}'.format(puma_id))

    population = generate_synthetic_people_and_households(
        STATE, puma_id, OUTPUT_DIR, allocator, person_model, household_model)
    print('Generated synthetic people and households for {}'.format(puma_id))

    accuracy = Accuracy.from_doppelganger(
        cleaned_data_persons=persons_data,
        cleaned_data_households=households_data,
        marginal_data=marginals,
        population=population)
    mean_error = accuracy.absolute_pct_error().mean()
    print('Absolute Percent Error for state {}, and puma {}: {}'.format(
        STATE, puma_id, mean_error))

    return True
def main():
    """Run the whole pipeline for the state/PUMA returned by ``parse_args()``.

    Steps, in order: load the configuration file, download and load the PUMS
    data, create the household and person Bayes nets, download tract data,
    generate the synthetic people and households, then log the mean absolute
    percent error of the result.

    Relies on the module-level names ``person_segmenter`` and
    ``household_segmenter``.
    """
    opts = parse_args()

    # Values reused by several pipeline steps get short local names; the
    # remaining options are read where they are consumed.
    state = opts.state_id
    puma = opts.puma_id
    out_dir = opts.output_dir

    config = Configuration.from_file(opts.config_file)

    households, persons = download_and_load_pums_data(
        out_dir, state, puma, config,
        opts.db_host, opts.db_database, opts.db_schema,
        opts.db_user, opts.db_password
    )

    household_model, person_model = create_bayes_net(
        state, puma, out_dir,
        households, persons, config,
        person_segmenter, household_segmenter
    )

    marginals, allocator = download_tract_data(
        state, puma, out_dir,
        opts.census_api_key, opts.puma_tract_mappings_csv,
        households, persons
    )

    population = generate_synthetic_people_and_households(
        state, puma, out_dir, allocator, person_model, household_model
    )

    accuracy = Accuracy.from_doppelganger(
        cleaned_data_persons=persons,
        cleaned_data_households=households,
        marginal_data=marginals,
        population=population
    )
    mean_error = accuracy.absolute_pct_error().mean()
    logging.info('Absolute Percent Error for state {}, and puma {}: {}'.format(
        state, puma, mean_error))