Example #1
0
def _load_raw_data(year: int) -> pd.DataFrame:
    if year == 2000:
        df = __combine_two_2000_files()
    elif year == 2005:
        nhgis_path = src_path('demographics',
                              'nhgis0017_ds195_20095_2009_blck_grp.csv')
        df = pd.read_csv(nhgis_path)
    elif year == 2010:
        nhgis_file = src_path('demographics',
                              'nhgis0014_ds176_20105_2010_blck_grp.csv')
        df = pd.read_csv(nhgis_file, encoding='latin1')

    return df
Example #2
0
def __combine_two_2000_files() -> pd.DataFrame:
    """
    NHGIS outputs the 2000 Census in two files for some reason. Combine
    them here.
    """
    path_a = src_path('demographics', 'nhgis0020_ds152_2000_blck_grp.csv')
    path_b = src_path('demographics', 'nhgis0020_ds147_2000_blck_grp.csv')
    df_a = pd.read_csv(path_a)
    df_b = pd.read_csv(path_b)
    # Columns present in both files are duplicates: keep only the
    # columns unique to each file (symmetric difference), plus the
    # GISJOIN merge key and the geography ID columns below.
    unique_cols = set(df_a.columns) ^ set(df_b.columns)
    geo_cols = ['STATEA', 'COUNTYA', 'TRACTA', 'BLCK_GRPA']
    df_a = df_a[['GISJOIN'] + geo_cols +
                [c for c in df_a.columns if c in unique_cols]]
    df_b = df_b[['GISJOIN'] +
                [c for c in df_b.columns if c in unique_cols]]
    # assertval=3 requires every row to match in both files
    return stata_merge(df_a, df_b, on='GISJOIN', assertval=3)
Example #3
0
def _download_bg_shapefile(state_fips: str, resolution: str) -> None:
    """Download a 2010 Census block-group cartographic shapefile zip.

    Skips the download when the zip is already present on disk.
    """
    dest_folder = src_path('cartographic_shapes')
    url = (
        'https://www2.census.gov/geo/' +
        f'tiger/GENZ2010/gz_2010_{state_fips}_150_00_{resolution}.zip'
    )
    zip_name = os.path.split(url)[1]
    zip_path = os.path.join(dest_folder, zip_name)
    # Guard clause: nothing to do if the file was already fetched.
    if os.path.isfile(zip_path):
        print(f'{zip_name} already on disk')
        return
    urllib.request.urlretrieve(url, zip_path)
Example #4
0
def block_pop_2000():
    """ Year 2000 pop, block-level.

    Returns a DataFrame indexed by 15-digit block ID with a single
    'pop2000' column.
    """

    nhgis_path = src_path('demographics', 'nhgis0009_ds147_2000_block.csv')

    usecols = ['STATEA', 'COUNTYA', 'TRACTA', 'BLOCKA', 'FXS001']
    df = pd.read_csv(nhgis_path, usecols=usecols, header=0)

    # Give the NHGIS variable a readable name, then lowercase everything
    df = df.rename(columns={'FXS001': 'pop2000'})
    df = df.rename(columns=lambda x: x.lower())

    # 5-digit county FIPS = zero-padded state (2) + county (3)
    df['fips'] = (df['statea'].astype(str).str.zfill(2) +
                  df['countya'].astype(str).str.zfill(3))
    # 15-digit block ID = county FIPS + tract (6) + block (4)
    df['block_id'] = (df['fips'] + df['tracta'].astype(str).str.zfill(6) +
                      df['blocka'].astype(str).str.zfill(4))

    # BUG FIX: the original drop had no axis argument, so pandas treated
    # these labels as row-index labels (axis=0) and raised KeyError.
    # They are columns, so drop along axis=1.
    df = df.drop(['statea', 'countya', 'tracta', 'blocka', 'fips'], axis=1)

    df = df.set_index('block_id')

    return df
Example #5
0
def load_fips_cbsa():
    """ NOTE: does not include all FIPS """
    # Download the NBER CBSA-to-FIPS crosswalk if it is not cached yet
    filepath = src_path('cbsa2fipsxw.csv')
    if not os.path.isfile(filepath):
        url = (r'https://www.nber.org/cbsa-csa-fips-county-crosswalk/'
               'cbsa2fipsxw.csv')
        urllib.request.urlretrieve(url, filepath)
    df = pd.read_csv(filepath)

    # Drop the first data row and the metro-division column
    df = df.drop(0, axis=0).drop(['metrodivisioncode'], axis=1)

    # Recode the metro/micro and central/outlying text columns using
    # the project's binary-flag helper
    df = _convert_binary(df, 'metropolitanmicropolitanstatis',
                         'Metropolitan Statistical Area', 'micropolitan')
    df = _convert_binary(df, 'centraloutlyingcounty', 'Outlying',
                         'outlying_county')

    rename_map = {
        'cbsacode': 'cbsa',
        'cbsatitle': 'cbsa_name',
        'countycountyequivalent': 'county_name',
        'statename': 'state_name',
        'fipsstatecode': 'state',
        'fipscountycode': 'county',
    }
    df = df.rename(columns=rename_map)

    for int_col in ('cbsa', 'state', 'county'):
        df[int_col] = df[int_col].astype(int)

    # 5-digit county FIPS string = zero-padded state (2) + county (3)
    df['fips'] = (df['state'].astype(str).str.zfill(2) +
                  df['county'].astype(str).str.zfill(3))

    return df
Example #6
0
def block_demogs(year: int):
    """Load 2010 block-level demographics from the NHGIS extract.

    Parameters
    ----------
    year : int
        Only 2010 is supported (asserted below).

    Returns
    -------
    pd.DataFrame
        Renamed, binned, dtype-cast block data with the
        hispanic/non-hispanic race splits collapsed into totals.
    """
    assert year == 2010

    csv_path = src_path('census', 'cross_sectional_data', 'nhgis_2010_block',
                        'nhgis0013_ds172_2010_block.csv')
    new_names = codebooks('block', year)
    columns = list(new_names.keys())
    # Read in chunks to bound memory. Collect chunks and concatenate
    # ONCE at the end: the original `df = df.append(chunk)` was O(n^2)
    # and DataFrame.append was removed in pandas 2.0.
    chunks = []
    chunk_size = 10**6
    for chunk_df in pd.read_csv(csv_path, chunksize=chunk_size):
        print('next chunk')
        chunk_df = chunk_df[columns]
        chunk_df = chunk_df.rename(columns=new_names)
        chunk_df = _age_bins(chunk_df)
        chunk_df = _cast_dtype(chunk_df)
        chunks.append(chunk_df)
    df = pd.concat(chunks) if chunks else pd.DataFrame()

    # Collapse the hispanic / non-hispanic race splits into totals,
    # then drop the split columns.
    race_stems = ('black', 'native', 'asian', 'island', 'other',
                  'two_or_more')
    for stem in race_stems:
        df[f'race_{stem}'] = (df[f'race_{stem}_no_hisp'] +
                              df[f'race_{stem}_hisp'])
    df = df.drop([f'race_{stem}_{suffix}'
                  for stem in race_stems
                  for suffix in ('no_hisp', 'hisp')],
                 axis=1)

    return df
Example #7
0
def _bg_zip_path(state_fips: str) -> str:
    """Return the path of a state's zipped 2010 block-group shapefile."""
    zip_name = f'gz_2010_{state_fips}_150_00_500k.zip'
    return src_path('census', 'shapefile_blockgroup', 'zipped', zip_name)