Exemple #1
0
def population_generator():
    """Run ``_generate_for_puma_data`` for every PUMA without an output file.

    A PUMA is treated as already completed when ``OUTPUT_DIR`` contains a file
    named ``state_<STATE>_puma_<puma>_households.csv``. Completed and
    remaining PUMAs are printed before the remaining ones are processed in
    sorted order.
    """
    safe_mkdir(OUTPUT_DIR)
    pumas_to_go = set(puma_df_clean.values.tolist())
    total_pumas = len(pumas_to_go)

    # List the output directory once instead of once per PUMA.
    existing_files = set(os.listdir(OUTPUT_DIR))

    completed = []
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        # The membership check keeps a PUMA id that appears more than once in
        # puma_df_clean from being removed (and reported) twice.
        if gen_puma in existing_files and puma in pumas_to_go:
            completed.append(puma)
            pumas_to_go.remove(puma)

    # Sort once so the printed list and the processing order agree.
    remaining = sorted(pumas_to_go)

    # str() lets the report work for numeric PUMA ids as well as strings.
    print("Already completed {} of {} pumas: {}".format(
        len(completed), total_pumas,
        ",".join(str(puma) for puma in sorted(completed))))
    print("{} pumas remaining: {}".format(
        len(remaining), ",".join(str(puma) for puma in remaining)))

    puma_tract_mappings = 'input/sample_data/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/sample_data/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)
    # The raw PUMS tables are loaded once and shared by every PUMA run.
    households_raw_data = PumsData.from_csv(
        'output/{}/household_pums_data.csv'.format(AOI_NAME))
    persons_raw_data = PumsData.from_csv(
        'output/{}/person_pums_data.csv'.format(AOI_NAME))

    results = [
        _generate_for_puma_data(households_raw_data, persons_raw_data,
                                preprocessor, puma_tract_mappings, puma_id)
        for puma_id in remaining
    ]

    # NOTE(review): the total is computed but discarded; confirm whether it
    # was meant to be returned or logged.
    sum(results)
def population_generator():
    """Generate and score a synthetic population for each unprocessed PUMA.

    PUMAs whose ``state_<STATE>_puma_<puma>_households.csv`` file already
    exists in ``output_dir`` are printed and skipped. For each remaining PUMA
    the household and person PUMS CSVs are loaded and cleaned for that
    state/PUMA, then passed through ``create_bayes_net``,
    ``download_tract_data`` and ``generate_synthetic_people_and_households``;
    the mean absolute percent error reported by ``Accuracy`` is logged.
    ``_combine_and_synthesize`` is called once at the end.
    """
    pumas_to_go = set(puma_df_clean.values.tolist())

    # Snapshot the directory once rather than re-listing it for every PUMA.
    existing_files = set(os.listdir(output_dir))
    for puma in puma_df_clean:
        gen_puma = 'state_{}_puma_{}_households.csv'.format(STATE, puma)
        # The membership check avoids a KeyError when the same PUMA id occurs
        # more than once in puma_df_clean.
        if gen_puma in existing_files and puma in pumas_to_go:
            print(puma)
            pumas_to_go.remove(puma)

    puma_tract_mappings = 'input/2010_puma_tract_mapping.txt'
    configuration = Configuration.from_file('input/config.json')
    preprocessor = Preprocessor.from_config(configuration.preprocessing_config)

    household_csv = 'input/{}_household_pums_data.csv'.format(AOI_NAME)
    person_csv = 'input/{}_person_pums_data.csv'.format(AOI_NAME)

    def no_segmentation(x):
        # Segmenter that always returns None, used for persons and households.
        return None

    person_segmenter = no_segmentation
    household_segmenter = no_segmentation

    for puma_id in pumas_to_go:
        state_key = str(int(STATE))
        puma_key = str(int(puma_id))

        # NOTE(review): the raw CSVs are re-read for every PUMA; loading them
        # once before the loop would be cheaper if PumsData.clean does not
        # modify its source data - confirm before hoisting.
        households_data = PumsData.from_csv(household_csv).clean(
            household_fields, preprocessor, state=state_key, puma=puma_key)
        persons_data = PumsData.from_csv(person_csv).clean(
            persons_fields, preprocessor, state=state_key, puma=puma_key)
        print("loaded")

        household_model, person_model = create_bayes_net(
            STATE, puma_id, output_dir,
            households_data, persons_data, configuration,
            person_segmenter, household_segmenter
        )

        marginals, allocator = download_tract_data(
            STATE, puma_id, output_dir, census_api_key, puma_tract_mappings,
            households_data, persons_data
        )

        print('Allocated {}'.format(puma_id))
        population = generate_synthetic_people_and_households(
            STATE, puma_id, output_dir, allocator,
            person_model, household_model
        )

        print('Generated {}'.format(puma_id))
        accuracy = Accuracy.from_doppelganger(
            cleaned_data_persons=persons_data,
            cleaned_data_households=households_data,
            marginal_data=marginals,
            population=population
        )

        logging.info(
            'Absolute Percent Error for state {}, and puma {}: {}'.format(
                STATE, puma_id, accuracy.absolute_pct_error().mean()))
    _combine_and_synthesize()
Exemple #3
0
def main():
    """Run the pipeline for the single state/PUMA given on the command line.

    Reads the parsed arguments, loads the configuration file, then calls in
    order: ``download_and_load_pums_data``, ``create_bayes_net``,
    ``download_tract_data``, ``generate_synthetic_people_and_households`` and
    ``Accuracy.from_doppelganger``. The mean absolute percent error is logged
    at INFO level.
    """
    cli = parse_args()
    mapping_csv = cli.puma_tract_mappings_csv
    state = cli.state_id
    puma = cli.puma_id
    api_key = cli.census_api_key
    config_path = cli.config_file
    out_dir = cli.output_dir
    # Database connection settings, in the positional order expected by
    # download_and_load_pums_data.
    db_settings = (
        cli.db_host,
        cli.db_database,
        cli.db_schema,
        cli.db_user,
        cli.db_password,
    )

    configuration = Configuration.from_file(config_path)

    households_data, persons_data = download_and_load_pums_data(
        out_dir, state, puma, configuration, *db_settings)

    household_model, person_model = create_bayes_net(
        state, puma, out_dir,
        households_data, persons_data, configuration,
        person_segmenter, household_segmenter,
    )

    marginals, allocator = download_tract_data(
        state, puma, out_dir, api_key, mapping_csv,
        households_data, persons_data,
    )

    population = generate_synthetic_people_and_households(
        state, puma, out_dir, allocator, person_model, household_model)

    accuracy = Accuracy.from_doppelganger(
        cleaned_data_persons=persons_data,
        cleaned_data_households=households_data,
        marginal_data=marginals,
        population=population,
    )
    mean_error = accuracy.absolute_pct_error().mean()
    logging.info(
        'Absolute Percent Error for state {}, and puma {}: {}'.format(
            state, puma, mean_error))
Exemple #4
0
def main():
    """Generate synthetic households and people for one state/PUMA.

    Command-line arguments supply the state and PUMA ids, file locations,
    Census API key and database settings. The steps run in order:
    ``download_and_load_pums_data``, ``create_bayes_net``,
    ``download_tract_data`` and ``generate_synthetic_people_and_households``.
    Nothing is returned.
    """
    options = parse_args()
    tract_mapping_csv = options.puma_tract_mappings_csv
    state = options.state_id
    puma = options.puma_id
    api_key = options.census_api_key
    config_path = options.config_file
    destination = options.output_dir
    # Database connection settings, kept in the positional order used by
    # download_and_load_pums_data.
    database_args = [
        options.db_host,
        options.db_database,
        options.db_schema,
        options.db_user,
        options.db_password,
    ]

    configuration = Configuration.from_file(config_path)

    households_data, persons_data = download_and_load_pums_data(
        destination, state, puma, configuration, *database_args)

    household_model, person_model = create_bayes_net(
        state, puma, destination,
        households_data, persons_data, configuration,
        person_segmenter, household_segmenter)

    # The return value of download_tract_data is used as the allocator as-is.
    allocator = download_tract_data(
        state, puma, destination, api_key, tract_mapping_csv,
        households_data, persons_data)

    generate_synthetic_people_and_households(
        state, puma, destination, allocator, person_model, household_model)
                                 na_filter=True)
    # Read __downloaded__ (see link above) household level PUMS (may take a while...)
    household_pums_df = pd.read_csv('input/ss14h{}.csv'.format(STATE_ABBREVIATION.lower()), na_values=['N.A'],
                                    na_filter=True)
    # filter household data and population data to AOI
    person_df_in_aoi = person_pums_df[person_pums_df['PUMA10'].isin(puma_df_clean.values)]
    person_df_in_aoi.loc[:, 'puma'] = person_df_in_aoi['PUMA10']
    household_df_in_aoi = household_pums_df[household_pums_df['PUMA10'].isin(puma_df_clean.values)]
    household_df_in_aoi.loc[:, 'puma'] = person_df_in_aoi['PUMA10']
    # Save for later use
    person_df_in_aoi.to_csv('input/{}_person_pums_data.csv'.format(AOI_NAME), index_label='index')
    household_df_in_aoi.to_csv('input/{}_household_pums_data.csv'.format(AOI_NAME), index_label='index')
    return household_df_in_aoi, person_df_in_aoi


# Sample configuration loaded at import time; the field tuples below read it.
configuration = Configuration.from_file('./input/sample_data/config.json')

# Household field names: the allocation defaults merged with the fields listed
# in the configuration, de-duplicated through a set.
household_fields = tuple(
    {field.name for field in allocation.DEFAULT_HOUSEHOLD_FIELDS}
    | set(configuration.household_fields)
)

# Person field names, built the same way from the person defaults.
persons_fields = tuple(
    {field.name for field in allocation.DEFAULT_PERSON_FIELDS}
    | set(configuration.person_fields)
)


def population_generator():
    # households_data_dirty, person_data_dirty = load_household_and_population_dfs()
    pumas_to_go = set(puma_df_clean.values.tolist())

    for puma in puma_df_clean: