def test_error_report(self, mock_comparison_datframe, mock_from_data_dir):
    """Check the three tables returned by ``error_report`` and its error path.

    ``error_report`` is run for two states (five PUMAs in total) and its
    per-PUMA, per-variable and overall results are compared with the values
    expected from ``self._mock_comparison_dataframe()``.  Finally an
    unknown statistic name must make ``error_report`` raise.

    The two ``mock_*`` parameters are not used in the body; they are
    presumably injected by ``patch`` decorators outside this block.
    """
    accuracy = Accuracy(Mock(), Mock(), Mock(), Mock(), Mock(), Mock(), Mock())
    accuracy.comparison_dataframe = self._mock_comparison_dataframe()
    # NOTE(review): ``from_data_dir`` is presumably the patched mock, so this
    # makes it hand back the prepared instance above -- confirm against the
    # decorators on this test.
    accuracy.from_data_dir.return_value = accuracy

    state_puma = {
        '20': ['00500', '00602', '00604'],
        '29': ['00901', '00902'],
    }
    marginal_variables = ['num_people', 'num_vehicles', 'age']
    expected_columns = ['marginal-pums', 'marginal-doppelganger']

    df_puma, df_variable, df_total = accuracy.error_report(
        state_puma,
        'fake_dir',
        marginal_variables=list(marginal_variables),
        statistic=ErrorStat.ABSOLUTE_PCT_ERROR,
    )

    # Test df_total
    # NOTE(review): this is a one-sided check with a tolerance of 1, so it is
    # very loose; consider an absolute difference with a tighter bound once
    # the exact type of ``df_total`` is confirmed.
    df_total_expected = pd.Series([2.00000, 0.666667], index=expected_columns)
    self.assertTrue(all((df_total - df_total_expected) < 1))

    # Test df_puma: one row per PUMA (5 in ``state_puma``), two columns.
    expected_puma_data = np.reshape([2.0, 2 / 3.0] * 5, (5, 2))
    df_expected_puma = pd.DataFrame(
        data=expected_puma_data,
        index=self._mock_state_puma(),
        columns=expected_columns,
    )
    self.assertTrue((df_expected_puma == df_puma).all().all())

    # Test df_variable: 12 rows indexed by ``self._mock_variable_bins()``.
    expected_variable_data = np.reshape([2.0, 2 / 3.0] * 12, (12, 2))
    df_expected_variable = pd.DataFrame(
        data=expected_variable_data,
        index=self._mock_variable_bins(),
        columns=expected_columns,
    )
    self.assertTrue((df_expected_variable == df_variable).all().all())

    # Test unimplemented statistic name.  The call must run inside the
    # context manager: handing the call's *result* to ``assertRaises`` and
    # wrapping it in ``try/except Exception: pass`` could never fail.
    with self.assertRaises(Exception):
        Accuracy.error_report(
            state_puma,
            'fake_dir',
            marginal_variables=list(marginal_variables),
            statistic='wrong-statistic-name',
        )
def population_generator():
    """Generate a synthetic population for every PUMA lacking an output file.

    A PUMA is skipped (and printed) when a file named
    ``state_<STATE>_puma_<puma>_households.csv`` already exists in
    ``output_dir``.  For each remaining PUMA the household and person PUMS
    CSVs are loaded and cleaned, the Bayes nets are built, tract data is
    downloaded, a population is generated and its mean absolute percent
    error is logged.  ``_combine_and_synthesize()`` is called at the end.

    Relies on module-level names defined outside this block:
    ``puma_df_clean``, ``STATE``, ``AOI_NAME``, ``output_dir``,
    ``household_fields``, ``persons_fields`` and ``census_api_key``.
    """
    pumas_to_go = set(puma_df_clean.values.tolist())

    # List the output directory once instead of once per PUMA; a set gives
    # constant-time membership tests.
    existing_outputs = set(os.listdir(output_dir))
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        if gen_puma in existing_outputs:
            print(puma)
            # ``discard`` rather than ``remove``: a PUMA that appears twice
            # (or is absent from the set) must not raise KeyError.
            pumas_to_go.discard(puma)

    puma_tract_mappings = 'input/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    # The segmenters do not depend on the PUMA, so build them once.
    person_segmenter = lambda x: None  # noqa: E731
    household_segmenter = lambda x: None  # noqa: E731

    for puma_id in pumas_to_go:
        households_data = PumsData.from_csv(
            'input/{}_household_pums_data.csv'.format(AOI_NAME)
        ).clean(
            household_fields,
            preprocessor,
            state=str(int(STATE)),
            puma=str(int(puma_id)),
        )
        persons_data = PumsData.from_csv(
            'input/{}_person_pums_data.csv'.format(AOI_NAME)
        ).clean(
            persons_fields,
            preprocessor,
            state=str(int(STATE)),
            puma=str(int(puma_id)),
        )
        print("loaded")

        household_model, person_model = create_bayes_net(
            STATE, puma_id, output_dir,
            households_data, persons_data, configuration,
            person_segmenter, household_segmenter
        )
        marginals, allocator = download_tract_data(
            STATE, puma_id, output_dir, census_api_key, puma_tract_mappings,
            households_data, persons_data
        )
        print('Allocated {}'.format(puma_id))

        population = generate_synthetic_people_and_households(
            STATE, puma_id, output_dir, allocator,
            person_model, household_model
        )
        print('Generated {}'.format(puma_id))

        accuracy = Accuracy.from_doppelganger(
            cleaned_data_persons=persons_data,
            cleaned_data_households=households_data,
            marginal_data=marginals,
            population=population
        )
        logging.info('Absolute Percent Error for state {}, and puma {}: {}'.format(
            STATE, puma_id, accuracy.absolute_pct_error().mean()))

    _combine_and_synthesize()
def test_error_metrics(self, mock_comparison_dataframe):
    """Verify the error metrics computed from the mocked comparison dataframe.

    ``mock_comparison_dataframe`` is not used in the body; it is presumably
    supplied by a ``patch`` decorator outside this block.
    """
    subject = Accuracy(Mock(), Mock(), Mock(), Mock(), Mock(), Mock(), Mock())
    subject.comparison_dataframe = self._mock_comparison_dataframe()

    # Root mean squared error is compared as a plain tuple.
    rmse = subject.root_mean_squared_error()
    self.assertEqual(rmse, (1.0, 1.0))

    # The remaining metrics are compared through their means.
    mean_root_squared = subject.root_squared_error().mean().tolist()
    self.assertListEqual(mean_root_squared, [1.0, 1.0])

    mean_abs_pct = subject.absolute_pct_error().mean().tolist()
    self.assertListEqual(mean_abs_pct, [2.0, 0.66666666666666663])
def _generate_for_puma_data(households_raw_data, persons_raw_data,
                            preprocessor, puma_tract_mappings, puma_id):
    """Run the clean / model / allocate / generate pipeline for one PUMA.

    Args:
        households_raw_data: object whose ``clean`` method yields the
            household data for the PUMA.
        persons_raw_data: object whose ``clean`` method yields the person
            data for the PUMA.
        preprocessor: passed through to both ``clean`` calls.
        puma_tract_mappings: passed through to ``download_tract_data``.
        puma_id: identifier of the PUMA; must be convertible with ``int``.

    Returns:
        Always ``True``.

    Relies on module-level names defined outside this block: ``STATE``,
    ``OUTPUT_DIR``, ``household_fields``, ``persons_fields``,
    ``configuration`` and ``census_api_key``.
    """
    households_data = households_raw_data.clean(
        household_fields,
        preprocessor,
        state=str(int(STATE)),
        puma=str(int(puma_id)),
    )
    persons_data = persons_raw_data.clean(
        persons_fields,
        preprocessor,
        state=str(int(STATE)),
        puma=str(int(puma_id)),
    )

    # Both segmenters return None for every input.
    segment_person = lambda x: None  # noqa: E731
    segment_household = lambda x: None  # noqa: E731

    loaded_message = "{} input data loaded. Starting allocation/generation."
    print(loaded_message.format(puma_id))

    household_model, person_model = create_bayes_net(
        STATE,
        puma_id,
        OUTPUT_DIR,
        households_data,
        persons_data,
        configuration,
        segment_person,
        segment_household,
    )

    marginals, allocator = download_tract_data(
        STATE,
        puma_id,
        OUTPUT_DIR,
        census_api_key,
        puma_tract_mappings,
        households_data,
        persons_data,
    )
    print('Allocated {}'.format(puma_id))

    population = generate_synthetic_people_and_households(
        STATE,
        puma_id,
        OUTPUT_DIR,
        allocator,
        person_model,
        household_model,
    )
    print('Generated synthetic people and households for {}'.format(puma_id))

    report = Accuracy.from_doppelganger(
        cleaned_data_persons=persons_data,
        cleaned_data_households=households_data,
        marginal_data=marginals,
        population=population,
    )
    mean_abs_pct_error = report.absolute_pct_error().mean()
    print('Absolute Percent Error for state {}, and puma {}: {}'.format(
        STATE, puma_id, mean_abs_pct_error))
    return True
def main():
    """Run the full pipeline for the PUMA named on the command line.

    Steps: parse arguments, load the configuration, download and load the
    PUMS data, build the Bayes nets, download tract data, generate the
    synthetic population and log its mean absolute percent error.

    ``person_segmenter`` and ``household_segmenter`` are taken from the
    enclosing module; they are not defined in this block.
    """
    args = parse_args()

    # Read every command-line value up front, before any work starts.
    tract_mapping_csv = args.puma_tract_mappings_csv
    state = args.state_id
    puma = args.puma_id
    api_key = args.census_api_key
    config_path = args.config_file
    out_dir = args.output_dir
    db_settings = (
        args.db_host,
        args.db_database,
        args.db_schema,
        args.db_user,
        args.db_password,
    )

    configuration = Configuration.from_file(config_path)

    households_data, persons_data = download_and_load_pums_data(
        out_dir, state, puma, configuration, *db_settings)

    household_model, person_model = create_bayes_net(
        state,
        puma,
        out_dir,
        households_data,
        persons_data,
        configuration,
        person_segmenter,
        household_segmenter,
    )

    marginals, allocator = download_tract_data(
        state,
        puma,
        out_dir,
        api_key,
        tract_mapping_csv,
        households_data,
        persons_data,
    )

    population = generate_synthetic_people_and_households(
        state, puma, out_dir, allocator, person_model, household_model)

    report = Accuracy.from_doppelganger(
        cleaned_data_persons=persons_data,
        cleaned_data_households=households_data,
        marginal_data=marginals,
        population=population,
    )
    mean_abs_pct_error = report.absolute_pct_error().mean()
    logging.info('Absolute Percent Error for state {}, and puma {}: {}'.format(
        state, puma, mean_abs_pct_error))