def _load_raw_data(year: int) -> pd.DataFrame:
    """Load raw NHGIS block-group demographics for one Census year.

    Parameters
    ----------
    year : int
        One of 2000, 2005, or 2010.

    Returns
    -------
    pd.DataFrame
        Raw NHGIS data as read from disk (2000 combines two source files).

    Raises
    ------
    ValueError
        If ``year`` is not a supported year. (Previously an unsupported
        year fell through every branch and hit ``UnboundLocalError``.)
    """
    if year == 2000:
        df = __combine_two_2000_files()
    elif year == 2005:
        nhgis_path = src_path('demographics',
                              'nhgis0017_ds195_20095_2009_blck_grp.csv')
        df = pd.read_csv(nhgis_path)
    elif year == 2010:
        nhgis_file = src_path('demographics',
                              'nhgis0014_ds176_20105_2010_blck_grp.csv')
        # This particular NHGIS file is not valid UTF-8; latin1 is required
        df = pd.read_csv(nhgis_file, encoding='latin1')
    else:
        raise ValueError(f'Unsupported year: {year}')
    return df
def __combine_two_2000_files() -> pd.DataFrame:
    """
    NHGIS outputs the 2000 Census in two files for some reason.
    Combine them here.
    """
    path_a = src_path('demographics', 'nhgis0020_ds152_2000_blck_grp.csv')
    path_b = src_path('demographics', 'nhgis0020_ds147_2000_blck_grp.csv')
    df_a = pd.read_csv(path_a)
    df_b = pd.read_csv(path_b)

    # Columns that appear in exactly one of the two files; shared columns
    # (other than the merge key and block-group IDs) are dropped so the
    # merge does not duplicate them.
    only_in_one = set(df_a.columns) ^ set(df_b.columns)
    bg_id_cols = ['STATEA', 'COUNTYA', 'TRACTA', 'BLCK_GRPA']
    keep_a = [col for col in df_a.columns if col in only_in_one]
    keep_b = [col for col in df_b.columns if col in only_in_one]

    df_a = df_a[['GISJOIN'] + bg_id_cols + keep_a]
    df_b = df_b[['GISJOIN'] + keep_b]

    # assertval=3 — presumably enforces that every row matches in both
    # files (Stata _merge == 3); confirm against stata_merge's contract.
    return stata_merge(df_a, df_b, on='GISJOIN', assertval=3)
def _download_bg_shapefile(state_fips: str, resolution: str) -> None:
    """Download one state's 2010 cartographic block-group zip from the
    Census Bureau, skipping the download if the file is already cached.
    """
    target_folder = src_path('cartographic_shapes')
    bg_url = ('https://www2.census.gov/geo/'
              f'tiger/GENZ2010/gz_2010_{state_fips}_150_00_{resolution}.zip')
    zip_file_name = os.path.basename(bg_url)
    zip_file_path = os.path.join(target_folder, zip_file_name)

    # Guard clause: nothing to do when the zip is already on disk
    if os.path.isfile(zip_file_path):
        print(f'{zip_file_name} already on disk')
        return

    urllib.request.urlretrieve(bg_url, zip_file_path)
def block_pop_2000():
    """Year 2000 population, block-level.

    Reads the NHGIS 2000 block file, renames the population variable
    (FXS001) to ``pop2000``, and builds a 15-digit ``block_id``
    (state + county + tract + block) as the index.

    Returns
    -------
    pd.DataFrame
        Indexed by ``block_id`` with a single ``pop2000`` column.
    """
    nhgis_path = src_path('demographics', 'nhgis0009_ds147_2000_block.csv')
    usecols = ['STATEA', 'COUNTYA', 'TRACTA', 'BLOCKA', 'FXS001']
    df = pd.read_csv(nhgis_path, usecols=usecols, header=0)
    df = df.rename(columns={'FXS001': 'pop2000'})
    df = df.rename(columns=lambda x: x.lower())
    # Zero-pad each geographic component to its fixed FIPS width
    df['fips'] = (df['statea'].astype(str).str.zfill(2)
                  + df['countya'].astype(str).str.zfill(3))
    df['block_id'] = (df['fips']
                      + df['tracta'].astype(str).str.zfill(6)
                      + df['blocka'].astype(str).str.zfill(4))
    # BUG FIX: original call omitted axis=1, so drop tried to remove ROW
    # labels and raised KeyError on these column names
    df = df.drop(['statea', 'countya', 'tracta', 'blocka', 'fips'], axis=1)
    df = df.set_index('block_id')
    return df
def load_fips_cbsa():
    """
    NOTE: does not include all FIPS
    """
    # Download the NBER CBSA-to-FIPS crosswalk if it is not cached locally
    filepath = src_path('cbsa2fipsxw.csv')
    if not os.path.isfile(filepath):
        url = (r'https://www.nber.org/cbsa-csa-fips-county-crosswalk/'
               'cbsa2fipsxw.csv')
        urllib.request.urlretrieve(url, filepath)

    df = pd.read_csv(filepath)
    df = df.drop(0, axis=0)
    df = df.drop(['metrodivisioncode'], axis=1)

    # Deal with metro/micro
    df = _convert_binary(df, 'metropolitanmicropolitanstatis',
                         'Metropolitan Statistical Area', 'micropolitan')
    df = _convert_binary(df, 'centraloutlyingcounty', 'Outlying',
                         'outlying_county')

    rename_map = {
        'cbsacode': 'cbsa',
        'cbsatitle': 'cbsa_name',
        'countycountyequivalent': 'county_name',
        'statename': 'state_name',
        'fipsstatecode': 'state',
        'fipscountycode': 'county',
    }
    df = df.rename(columns=rename_map)

    for int_col in ('cbsa', 'state', 'county'):
        df[int_col] = df[int_col].astype(int)

    # 5-digit FIPS = 2-digit state + 3-digit county, zero-padded
    df['fips'] = (df['state'].astype(str).str.zfill(2)
                  + df['county'].astype(str).str.zfill(3))
    return df
def block_demogs(year: int):
    """Load 2010 block-level demographics from NHGIS in chunks.

    Parameters
    ----------
    year : int
        Only 2010 is supported.

    Returns
    -------
    pd.DataFrame
        Renamed/typed block demographics with hispanic and non-hispanic
        race splits collapsed into single race totals.
    """
    assert year == 2010, f'only year 2010 is supported, got {year}'
    csv_path = src_path('census', 'cross_sectional_data', 'nhgis_2010_block',
                        'nhgis0013_ds172_2010_block.csv')
    new_names = codebooks('block', year)
    columns = list(new_names.keys())

    # Stream the large CSV in chunks; accumulate in a list and concat ONCE.
    # BUG FIX: the original used DataFrame.append per chunk, which is
    # quadratic and was removed entirely in pandas 2.0.
    chunks = []
    chunk_size = 10**6
    for chunk_df in pd.read_csv(csv_path, chunksize=chunk_size):
        print('next chunk')
        chunk_df = chunk_df[columns]
        chunk_df = chunk_df.rename(columns=new_names)
        chunk_df = _age_bins(chunk_df)
        chunk_df = _cast_dtype(chunk_df)
        chunks.append(chunk_df)
    df = pd.concat(chunks) if chunks else pd.DataFrame()

    # Completing race variables: total = non-hispanic + hispanic,
    # then drop the split columns
    races = ['black', 'native', 'asian', 'island', 'other', 'two_or_more']
    for race in races:
        df[f'race_{race}'] = (df[f'race_{race}_no_hisp']
                              + df[f'race_{race}_hisp'])
    drop_cols = ([f'race_{r}_no_hisp' for r in races]
                 + [f'race_{r}_hisp' for r in races])
    df = df.drop(drop_cols, axis=1)
    return df
def _bg_zip_path(state_fips: str) -> str:
    """Return the on-disk path of the zipped 2010 block-group shapefile
    for the given state FIPS code."""
    zip_name = f'gz_2010_{state_fips}_150_00_500k.zip'
    return src_path('census', 'shapefile_blockgroup', 'zipped', zip_name)