Beispiel #1
0
    def test_error_report(self, mock_comparison_datframe, mock_from_data_dir):
        """Check the per-PUMA, per-variable and total error reports.

        An ``Accuracy`` built from mocks is given the fixture comparison
        dataframe, and ``from_data_dir`` is configured to return that same
        instance. The three results of ``error_report`` are then compared
        against the expected absolute-percent-error values, and an unknown
        statistic name is expected to raise.

        The two mock parameters are not used directly in the body; they are
        presumably injected by patch decorators outside this view.
        """
        accuracy = Accuracy(Mock(), Mock(), Mock(), Mock(), Mock(), Mock(),
                            Mock())
        accuracy.comparison_dataframe = self._mock_comparison_dataframe()
        accuracy.from_data_dir.return_value = accuracy

        # Two states with three and two PUMAs respectively (five in total).
        state_puma = {
            '20': ['00500', '00602', '00604'],
            '29': ['00901', '00902'],
        }
        marginal_variables = ['num_people', 'num_vehicles', 'age']
        expected_columns = ['marginal-pums', 'marginal-doppelganger']

        df_puma, df_variable, df_total = accuracy.error_report(
            state_puma, 'fake_dir',
            marginal_variables=marginal_variables,
            statistic=ErrorStat.ABSOLUTE_PCT_ERROR)

        # Test df_total
        df_total_expected = pd.Series([2.00000, 0.666667],
                                      index=expected_columns)
        # Compare the absolute difference: a signed difference would let an
        # arbitrarily large negative error slip through the "< 1" bound.
        self.assertTrue(all((df_total - df_total_expected).abs() < 1))

        # Test df_puma
        expected_puma_data = np.reshape([2.0, 2 / 3.0] * 5, (5, 2))
        df_expected_puma = pd.DataFrame(data=expected_puma_data,
                                        index=self._mock_state_puma(),
                                        columns=expected_columns)
        self.assertTrue((df_expected_puma == df_puma).all().all())

        # Test df_variable
        expected_variable_data = np.reshape([2.0, 2 / 3.0] * 12, (12, 2))
        df_expected_variable = pd.DataFrame(data=expected_variable_data,
                                            index=self._mock_variable_bins(),
                                            columns=expected_columns)
        self.assertTrue((df_expected_variable == df_variable).all().all())

        # Test unimplemented statistic name.
        # The context-manager form runs the call inside assertRaises, so the
        # test fails if no exception is raised instead of passing vacuously.
        with self.assertRaises(Exception):
            Accuracy.error_report(
                state_puma,
                'fake_dir',
                marginal_variables=marginal_variables,
                statistic='wrong-statistic-name')
def population_generator():
    """Generate synthetic populations for every PUMA not yet processed.

    PUMAs whose ``state_<STATE>_puma_<puma>_households.csv`` file is already
    present in ``output_dir`` are printed and skipped. For each remaining
    PUMA the household and person PUMS CSVs are loaded and cleaned, the
    Bayes nets are built, tract data is downloaded, a synthetic population
    is generated, and the mean absolute percent error is logged. Finally
    ``_combine_and_synthesize()`` is called.

    Relies on module-level names that are not parameters: ``puma_df_clean``,
    ``STATE``, ``AOI_NAME``, ``output_dir``, ``household_fields``,
    ``persons_fields`` and ``census_api_key``.
    """
    pumas_to_go = set(puma_df_clean.values.tolist())

    # List the output directory once; a set gives O(1) membership checks
    # instead of re-reading and scanning the directory for every PUMA.
    existing_outputs = set(os.listdir(output_dir))

    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        if gen_puma in existing_outputs:
            print(puma)
            # discard() rather than remove(): a PUMA id that occurs more
            # than once in puma_df_clean must not raise KeyError.
            pumas_to_go.discard(puma)

    puma_tract_mappings = 'input/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    for puma_id in pumas_to_go:
        households_data = PumsData.from_csv(
            'input/{}_household_pums_data.csv'.format(AOI_NAME)
        ).clean(household_fields, preprocessor,
                state=str(int(STATE)), puma=str(int(puma_id)))
        persons_data = PumsData.from_csv(
            'input/{}_person_pums_data.csv'.format(AOI_NAME)
        ).clean(persons_fields, preprocessor,
                state=str(int(STATE)), puma=str(int(puma_id)))
        # Segmenters that map every record to None.
        person_segmenter = lambda x: None
        household_segmenter = lambda x: None
        print("loaded")

        household_model, person_model = create_bayes_net(
            STATE, puma_id, output_dir,
            households_data, persons_data, configuration,
            person_segmenter, household_segmenter
        )

        marginals, allocator = download_tract_data(
            STATE, puma_id, output_dir, census_api_key, puma_tract_mappings,
            households_data, persons_data
        )

        print('Allocated {}'.format(puma_id))
        population = generate_synthetic_people_and_households(
            STATE, puma_id, output_dir, allocator,
            person_model, household_model
        )

        print('Generated {}'.format(puma_id))
        accuracy = Accuracy.from_doppelganger(
            cleaned_data_persons=persons_data,
            cleaned_data_households=households_data,
            marginal_data=marginals,
            population=population
        )

        logging.info(
            'Absolute Percent Error for state {}, and puma {}: {}'.format(
                STATE, puma_id, accuracy.absolute_pct_error().mean()))
    _combine_and_synthesize()
Beispiel #3
0
 def test_error_metrics(self, mock_comparison_dataframe):
     """Check the error metrics computed from the fixture dataframe.

     Builds an ``Accuracy`` from seven mocks, installs the fixture
     comparison dataframe, and verifies the root mean squared error, the
     mean root squared error and the mean absolute percent error.
     """
     constructor_args = [Mock() for _ in range(7)]
     accuracy = Accuracy(*constructor_args)
     accuracy.comparison_dataframe = self._mock_comparison_dataframe()

     rmse = accuracy.root_mean_squared_error()
     self.assertEqual(rmse, (1.0, 1.0))

     mean_root_squared = accuracy.root_squared_error().mean().tolist()
     self.assertListEqual(mean_root_squared, [1.0, 1.0])

     mean_abs_pct = accuracy.absolute_pct_error().mean().tolist()
     self.assertListEqual(mean_abs_pct, [2.0, 0.66666666666666663])
Beispiel #4
0
def _generate_for_puma_data(households_raw_data, persons_raw_data,
                            preprocessor, puma_tract_mappings, puma_id):
    """Run the generation pipeline for a single PUMA and print its error.

    Cleans the raw household and person data for ``STATE`` / ``puma_id``,
    builds the Bayes nets, downloads tract data, generates the synthetic
    population and prints the mean absolute percent error.

    Uses the module-level names ``STATE``, ``OUTPUT_DIR``, ``configuration``,
    ``household_fields``, ``persons_fields`` and ``census_api_key``.

    Returns:
        True, once every step has completed.
    """
    # Both clean() calls filter on the same state/PUMA pair.
    region = {'state': str(int(STATE)), 'puma': str(int(puma_id))}
    households_data = households_raw_data.clean(
        household_fields, preprocessor, **region)
    persons_data = persons_raw_data.clean(
        persons_fields, preprocessor, **region)

    # Segmenters that map every record to None.
    segment_person = lambda x: None
    segment_household = lambda x: None

    loaded_message = "{} input data loaded. Starting allocation/generation."
    print(loaded_message.format(puma_id))

    household_model, person_model = create_bayes_net(
        STATE,
        puma_id,
        OUTPUT_DIR,
        households_data,
        persons_data,
        configuration,
        segment_person,
        segment_household,
    )

    marginals, allocator = download_tract_data(
        STATE, puma_id, OUTPUT_DIR, census_api_key, puma_tract_mappings,
        households_data, persons_data)

    print('Allocated {}'.format(puma_id))

    population = generate_synthetic_people_and_households(
        STATE, puma_id, OUTPUT_DIR, allocator, person_model, household_model)

    print('Generated synthetic people and households for {}'.format(puma_id))

    accuracy = Accuracy.from_doppelganger(
        cleaned_data_persons=persons_data,
        cleaned_data_households=households_data,
        marginal_data=marginals,
        population=population)

    mean_error = accuracy.absolute_pct_error().mean()
    print('Absolute Percent Error for state {}, and puma {}: {}'.format(
        STATE, puma_id, mean_error))
    return True
Beispiel #5
0
def main():
    """Run the generation pipeline for one PUMA from command-line arguments.

    Parses the arguments, loads the configuration and PUMS data, builds the
    Bayes nets, downloads tract data, generates the synthetic population and
    logs the mean absolute percent error.

    ``person_segmenter`` and ``household_segmenter`` are not defined here;
    they are taken from the enclosing module.
    """
    args = parse_args()

    # Pull everything off the parsed arguments up front.
    state_id, puma_id = args.state_id, args.puma_id
    output_dir = args.output_dir
    census_api_key = args.census_api_key
    puma_tract_mappings = args.puma_tract_mappings_csv
    config_file = args.config_file
    db_settings = (
        args.db_host,
        args.db_database,
        args.db_schema,
        args.db_user,
        args.db_password,
    )

    configuration = Configuration.from_file(config_file)

    households_data, persons_data = download_and_load_pums_data(
        output_dir, state_id, puma_id, configuration, *db_settings)

    household_model, person_model = create_bayes_net(
        state_id,
        puma_id,
        output_dir,
        households_data,
        persons_data,
        configuration,
        person_segmenter,
        household_segmenter,
    )

    marginals, allocator = download_tract_data(
        state_id, puma_id, output_dir, census_api_key, puma_tract_mappings,
        households_data, persons_data)

    population = generate_synthetic_people_and_households(
        state_id, puma_id, output_dir, allocator, person_model,
        household_model)

    accuracy = Accuracy.from_doppelganger(
        cleaned_data_persons=persons_data,
        cleaned_data_households=households_data,
        marginal_data=marginals,
        population=population)

    mean_error = accuracy.absolute_pct_error().mean()
    logging.info('Absolute Percent Error for state {}, and puma {}: {}'.format(
        state_id, puma_id, mean_error))