def population_generator():
    """Generate synthetic populations for every PUMA that has no output yet.

    A PUMA counts as already processed when a file named
    ``state_<STATE>_puma_<puma>_households.csv`` is present in ``output_dir``;
    such PUMAs are printed and skipped.  For each remaining PUMA the household
    and person PUMS data are cleaned, the Bayes nets are created, tract data
    is downloaded, a synthetic population is generated, and the mean of the
    absolute percent error is logged.  ``_combine_and_synthesize()`` is called
    once after the loop.

    Relies on the module-level names ``puma_df_clean``, ``STATE``,
    ``AOI_NAME``, ``output_dir``, ``census_api_key``, ``household_fields``
    and ``persons_fields``.
    """
    pumas_to_go = set(puma_df_clean.values.tolist())

    # List the output directory once and keep it as a set: the original
    # re-listed the directory (and scanned the resulting list) per PUMA.
    existing_outputs = set(os.listdir(output_dir))

    # NOTE(review): iterating puma_df_clean directly yields its values only if
    # it is a Series-like object; for a DataFrame it would yield column
    # names -- confirm against where puma_df_clean is built.
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        if gen_puma in existing_outputs:
            print(puma)
            # discard() instead of remove(): the set is already deduplicated,
            # so a PUMA id occurring twice in puma_df_clean would otherwise
            # raise KeyError on its second occurrence.
            pumas_to_go.discard(puma)

    puma_tract_mappings = 'input/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    if pumas_to_go:
        # The CSV paths do not depend on the PUMA, so parse each file once
        # instead of once per PUMA.  Skipped entirely when nothing is left to
        # do, matching the original behaviour of not touching the inputs.
        # Assumes clean() returns a new filtered dataset and leaves the raw
        # data untouched -- TODO confirm against PumsData.clean.
        households_raw = PumsData.from_csv(
            'input/{}_household_pums_data.csv'.format(AOI_NAME))
        persons_raw = PumsData.from_csv(
            'input/{}_person_pums_data.csv'.format(AOI_NAME))
        state_filter = str(int(STATE))

    # Segmenters that return None for every input; identical for all PUMAs.
    person_segmenter = lambda x: None
    household_segmenter = lambda x: None

    for puma_id in pumas_to_go:
        puma_filter = str(int(puma_id))
        households_data = households_raw.clean(
            household_fields, preprocessor,
            state=state_filter, puma=puma_filter)
        persons_data = persons_raw.clean(
            persons_fields, preprocessor,
            state=state_filter, puma=puma_filter)
        print("loaded")

        household_model, person_model = create_bayes_net(
            STATE, puma_id, output_dir,
            households_data, persons_data, configuration,
            person_segmenter, household_segmenter
        )

        marginals, allocator = download_tract_data(
            STATE, puma_id, output_dir, census_api_key, puma_tract_mappings,
            households_data, persons_data
        )

        print('Allocated {}'.format(puma_id))
        population = generate_synthetic_people_and_households(
            STATE, puma_id, output_dir, allocator,
            person_model, household_model
        )

        print('Generated {}'.format(puma_id))
        accuracy = Accuracy.from_doppelganger(
            cleaned_data_persons=persons_data,
            cleaned_data_households=households_data,
            marginal_data=marginals,
            population=population
        )

        logging.info(
            'Absolute Percent Error for state {}, and puma {}: {}'.format(
                STATE, puma_id, accuracy.absolute_pct_error().mean()))
    _combine_and_synthesize()
Exemple #2
0
def _generate_for_puma_data(households_raw_data, persons_raw_data,
                            preprocessor, puma_tract_mappings, puma_id):
    """Run the clean / model / allocate / generate / score steps for one PUMA.

    Both raw inputs are cleaned with the module-level field lists and the
    given ``preprocessor``, restricted to ``STATE`` and ``puma_id``.  Progress
    messages and the mean of the absolute percent error are printed.

    Uses the module-level names ``STATE``, ``OUTPUT_DIR``, ``configuration``,
    ``census_api_key``, ``household_fields`` and ``persons_fields``.

    Returns:
        Always ``True``.
    """
    households = households_raw_data.clean(
        household_fields, preprocessor,
        state=str(int(STATE)), puma=str(int(puma_id)))
    persons = persons_raw_data.clean(
        persons_fields, preprocessor,
        state=str(int(STATE)), puma=str(int(puma_id)))

    # Segmenters that return None for every input.
    segment_person = lambda x: None
    segment_household = lambda x: None

    loaded_msg = "{} input data loaded. Starting allocation/generation.".format(
        puma_id)
    print(loaded_msg)

    bayes_nets = create_bayes_net(
        STATE,
        puma_id,
        OUTPUT_DIR,
        households,
        persons,
        configuration,
        segment_person,
        segment_household,
    )
    household_model, person_model = bayes_nets

    tract_data = download_tract_data(
        STATE,
        puma_id,
        OUTPUT_DIR,
        census_api_key,
        puma_tract_mappings,
        households,
        persons,
    )
    marginals, allocator = tract_data

    print('Allocated {}'.format(puma_id))

    synthetic_population = generate_synthetic_people_and_households(
        STATE,
        puma_id,
        OUTPUT_DIR,
        allocator,
        person_model,
        household_model,
    )

    print('Generated synthetic people and households for {}'.format(puma_id))

    score = Accuracy.from_doppelganger(
        cleaned_data_persons=persons,
        cleaned_data_households=households,
        marginal_data=marginals,
        population=synthetic_population,
    )
    mean_error = score.absolute_pct_error().mean()

    print('Absolute Percent Error for state {}, and puma {}: {}'.format(
        STATE, puma_id, mean_error))
    return True
Exemple #3
0
def main():
    """Run the full pipeline for the state and PUMA given by ``parse_args()``.

    Loads the configuration file, obtains household and person PUMS data via
    ``download_and_load_pums_data``, creates the Bayes nets, downloads tract
    data, generates the synthetic population, and logs the mean of the
    absolute percent error.

    Uses the names ``person_segmenter`` and ``household_segmenter`` from the
    enclosing module scope.
    """
    opts = parse_args()

    # Pull everything off the parsed arguments up front.
    state, puma = opts.state_id, opts.puma_id
    out_dir = opts.output_dir
    api_key = opts.census_api_key
    tract_mappings = opts.puma_tract_mappings_csv
    db_settings = (
        opts.db_host,
        opts.db_database,
        opts.db_schema,
        opts.db_user,
        opts.db_password,
    )

    config = Configuration.from_file(opts.config_file)

    households, persons = download_and_load_pums_data(
        out_dir, state, puma, config, *db_settings)

    household_net, person_net = create_bayes_net(
        state,
        puma,
        out_dir,
        households,
        persons,
        config,
        person_segmenter,
        household_segmenter,
    )

    marginals, allocator = download_tract_data(
        state,
        puma,
        out_dir,
        api_key,
        tract_mappings,
        households,
        persons,
    )

    synthetic_population = generate_synthetic_people_and_households(
        state, puma, out_dir, allocator, person_net, household_net)

    score = Accuracy.from_doppelganger(
        cleaned_data_persons=persons,
        cleaned_data_households=households,
        marginal_data=marginals,
        population=synthetic_population,
    )
    mean_error = score.absolute_pct_error().mean()
    logging.info('Absolute Percent Error for state {}, and puma {}: {}'.format(
        state, puma, mean_error))