def population_generator():
    """Generate synthetic households for every PUMA not yet completed.

    Scans OUTPUT_DIR for already-generated household CSVs, skips those
    PUMAs, then runs ``_generate_for_puma_data`` for each remaining PUMA.

    Returns:
        The sum of the per-PUMA generation results (previously this value
        was computed and silently discarded; callers that ignore the return
        value are unaffected).
    """
    safe_mkdir(OUTPUT_DIR)
    # set() is unordered, so the original sorted() before set() was a no-op.
    pumas_to_go = set(puma_df_clean.values.tolist())
    total_pumas = len(pumas_to_go)
    completed = []
    # List the output directory once, not once per PUMA.
    existing_outputs = set(os.listdir(OUTPUT_DIR))
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        if gen_puma in existing_outputs:
            completed.append(puma)
            pumas_to_go.remove(puma)
    print("Already completed {} of {} pumas: {}".format(
        len(completed), total_pumas, ",".join(sorted(completed))))
    print("{} pumas remaining: {}".format(
        len(pumas_to_go), ",".join(sorted(pumas_to_go))))

    puma_tract_mappings = 'input/sample_data/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/sample_data/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)
    # Raw PUMS extracts are shared across PUMAs; load each file once.
    households_raw_data = PumsData.from_csv(
        'output/{}/household_pums_data.csv'.format(AOI_NAME))
    persons_raw_data = PumsData.from_csv(
        'output/{}/person_pums_data.csv'.format(AOI_NAME))
    results = [
        _generate_for_puma_data(
            households_raw_data, persons_raw_data, preprocessor,
            puma_tract_mappings, puma_id)
        for puma_id in pumas_to_go
    ]
    return sum(results)
def test_binning_generator(self):
    """Values are mapped to the bin labels derived from the configured edges."""
    edges = [0, 20000, 40000, 60000]
    preprocessor = Preprocessor.from_config({'test_input': {'bins': edges}})
    binner = preprocessor.input_to_preprocessor['test_input']
    # (input value, expected bin label) pairs covering low edge, interior, overflow.
    for value, label in ((0, '<=0'), (10000, '0-20000'), (100000, '60000+')):
        self.assertEqual(binner(value), label)
def test_clean_data_one_field(self):
    """Cleaning with a single field keeps only that preprocessed column."""
    frame = pandas.DataFrame(self._mock_dirty_household_input())
    cleaned = datasource.PumsData(frame).clean(
        [inputs.NUM_PEOPLE.name], Preprocessor())
    self.assertDictEqual(
        cleaned.data.loc[1].to_dict(),
        {inputs.NUM_PEOPLE.name: '2'},
    )
def population_generator():
    """Build Bayes nets and synthesize a population for every remaining PUMA.

    Skips PUMAs whose household output file already exists in ``output_dir``,
    then for each remaining PUMA: cleans the PUMS data, fits household/person
    models, downloads tract marginals, generates synthetic people/households,
    and logs the accuracy. Finally combines the per-PUMA outputs.
    """
    pumas_to_go = set(puma_df_clean.values.tolist())
    # List the output directory once, not once per PUMA.
    existing_outputs = set(os.listdir(output_dir))
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        if gen_puma in existing_outputs:
            print(puma)
            pumas_to_go.remove(puma)

    puma_tract_mappings = 'input/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    # The raw CSVs do not depend on puma_id; load each file once instead of
    # re-reading them on every loop iteration. Only clean() is per-PUMA.
    households_raw = PumsData.from_csv(
        'input/{}_household_pums_data.csv'.format(AOI_NAME))
    persons_raw = PumsData.from_csv(
        'input/{}_person_pums_data.csv'.format(AOI_NAME))

    # No segmentation: a single segment (None) for all records.
    def person_segmenter(x):
        return None

    def household_segmenter(x):
        return None

    for puma_id in pumas_to_go:
        households_data = households_raw.clean(
            household_fields, preprocessor,
            state=str(int(STATE)), puma=str(int(puma_id)))
        persons_data = persons_raw.clean(
            persons_fields, preprocessor,
            state=str(int(STATE)), puma=str(int(puma_id)))
        print("loaded")
        household_model, person_model = create_bayes_net(
            STATE, puma_id, output_dir,
            households_data, persons_data, configuration,
            person_segmenter, household_segmenter
        )
        marginals, allocator = download_tract_data(
            STATE, puma_id, output_dir, census_api_key, puma_tract_mappings,
            households_data, persons_data
        )
        print('Allocated {}'.format(puma_id))
        population = generate_synthetic_people_and_households(
            STATE, puma_id, output_dir, allocator,
            person_model, household_model
        )
        print('Generated {}'.format(puma_id))
        accuracy = Accuracy.from_doppelganger(
            cleaned_data_persons=persons_data,
            cleaned_data_households=households_data,
            marginal_data=marginals,
            population=population
        )
        logging.info(
            'Absolute Percent Error for state {}, and puma {}: {}'.format(
                STATE, puma_id, accuracy.absolute_pct_error().mean()))
    # Merge all per-PUMA outputs after every PUMA has been generated.
    _combine_and_synthesize()
def test_clean_data_filter_length(self):
    """State and puma filters each restrict the cleaned row count."""
    names = [
        inputs.SERIAL_NUMBER.name,
        inputs.STATE.name,
        inputs.PUMA.name,
    ]
    pums_data = datasource.PumsData(
        pandas.DataFrame(self._mock_dirty_household_puma_state_input()))
    # (filter kwargs, expected surviving row count)
    expectations = [
        ({}, 3),
        ({'state': '06'}, 2),
        ({'puma': '00106'}, 2),
        ({'state': '06', 'puma': '00106'}, 1),
    ]
    for kwargs, expected_len in expectations:
        cleaned = pums_data.clean(names, Preprocessor(), **kwargs)
        self.assertEqual(len(cleaned.data), expected_len)
def test_prior_creation(self):
    """The Laplace prior enumerates the full age x sex cross product."""
    observed = bayesnets.generate_laplace_prior_data(
        (inputs.AGE.name, inputs.SEX.name), Preprocessor())
    age_bins = ('0-17', '18-34', '35-64', '65+')
    expected = {(age, sex) for age in age_bins for sex in ('M', 'F')}
    self.assertSetEqual(expected, observed)
def test_clean_data(self):
    """Every requested household field is preprocessed into its display value."""
    field_names = [
        inputs.SERIAL_NUMBER.name,
        inputs.NUM_PEOPLE.name,
        inputs.NUM_VEHICLES.name,
        inputs.HOUSEHOLD_INCOME.name,
    ]
    cleaned = datasource.PumsData(
        pandas.DataFrame(self._mock_dirty_household_input())
    ).clean(field_names, Preprocessor())
    expected = {
        inputs.SERIAL_NUMBER.name: 'b',
        inputs.NUM_PEOPLE.name: '2',
        inputs.NUM_VEHICLES.name: '3+',
        inputs.HOUSEHOLD_INCOME.name: '40000+',
    }
    self.assertDictEqual(cleaned.data.loc[1].to_dict(), expected)
def fetch_pums_data(state_id, puma_id, configuration, db_host, db_database,
                    db_schema, db_user, db_password):
    '''Download PUMS data from pums tables stored in a database

    Args:
        state_id: 2-digit state fips code
        puma_id: 5-digit puma code
        configuration: contains person and household fields, along with how to
            instruct the preprocessor to discretize the fields
        db_host: hostname of the POSTGRESQL instance to connect to
        db_database: database name to connect to
        db_schema: schema which _must_ contain a person and household table
            with pums fields referenced in doppelganger/inputs.py
        db_user: username to connect with
        db_password: password to authenticate to the database

    Returns:
        person_data: a PumsData wrapped dataframe whose fields have been
            mapped according to inputs.py
        households_data: same as person_data but for households

        Both are None when the database query fails (the error is printed).
    '''
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    # Union default and extra fields
    person_fields = link_fields_to_inputs(configuration.person_fields)
    person_fields = allocation.DEFAULT_PERSON_FIELDS.union(person_fields)
    person_fieldnames = tuple(set(p.name for p in person_fields))

    household_fields = link_fields_to_inputs(configuration.household_fields)
    household_fields = allocation.DEFAULT_HOUSEHOLD_FIELDS.union(
        household_fields)
    household_fieldnames = tuple(set(hh.name for hh in household_fields))

    # BUGFIX: initialize results so a failed connect/query no longer raises
    # UnboundLocalError at the return statement, masking the real error.
    households_data = None
    persons_data = None
    puma_conn = None
    try:
        puma_conn = psycopg2.connect(
            host=db_host,
            database=db_database,
            user=db_user,
            password=db_password,
        )
        households_data = datasource.PumsData.from_database(
            conn=puma_conn,
            state_id=state_id,
            puma_id=puma_id,
            schema_name=db_schema,
            table_name=HOUSEHOLD_TABLE,
            fields=household_fields).clean(field_names=household_fieldnames,
                                           preprocessor=preprocessor,
                                           state=state_id,
                                           puma=puma_id)
        persons_data = datasource.PumsData.from_database(
            conn=puma_conn,
            state_id=state_id,
            puma_id=puma_id,
            schema_name=db_schema,
            table_name=PERSONS_TABLE,
            fields=person_fields).clean(field_names=person_fieldnames,
                                        preprocessor=preprocessor,
                                        state=state_id,
                                        puma=puma_id)
    except psycopg2.DatabaseError as error:
        print(error)
    finally:
        # Close the connection whether or not the queries succeeded.
        if puma_conn is not None:
            puma_conn.close()
            print('Database connection closed.')
    return households_data, persons_data