def main():
    """Explore ACS 2015 5-year data and export two derived rates for Cook County, IL.

    The function, in order:
      1. sets two pandas display options;
      2. runs two variable searches and prints tables B23025 and B15003;
      3. downloads the selected variables for every block group in
         state 17 / county 031;
      4. derives ``percent_unemployed`` and ``percent_nohs``;
      5. prints summary statistics, the 30 block groups with the highest
         ``percent_unemployed``, and the correlation matrix;
      6. writes the two derived columns to ``cook_data.csv``.
    """
    # Keep wide frames on a single line and print two decimal places.
    for option, value in (('display.expand_frame_repr', False),
                          ('display.precision', 2)):
        pd.set_option(option, value)

    # To download data we first identify the tables containing the variables
    # of interest, using the ACS documentation, in particular the Table Shells
    # (https://www.census.gov/programs-surveys/acs/technical-documentation/summary-file-documentation.html).
    # censusdata.search finds variables matching a text pattern; the slices
    # limit the output to the relevant variables.
    # NOTE: the sliced results are not printed or stored here.
    censusdata.search('acs5', 2015, 'label', 'unemploy')[160:170]
    censusdata.search('acs5', 2015, 'concept', 'education')[730:790]

    # censusdata.printtable shows the variables in a table.
    for table_id in ('B23025', 'B15003'):
        censusdata.printtable(censusdata.censustable('acs5', 2015, table_id))

    # After choosing the variables we need to identify the geographies.
    # We want block groups in Cook County, IL:
    #   1. look up the state FIPS code
    #   2. list all counties within IL to find Cook
    # 1
    # print(censusdata.geographies(censusdata.censusgeo([('state','*')]), 'acs5', 2015))
    # IL is 17
    # 2
    # print(censusdata.geographies(censusdata.censusgeo([('state','17'), ('county', '*')]), 'acs5', 2015))
    # Cook is 031

    # With variables and geographies identified, download the data with
    # censusdata.download and compute the percent unemployed and the percent
    # with no high-school degree.
    unemployed_base_var = 'B23025_003E'
    unemployed_count_var = 'B23025_005E'
    education_base_var = 'B15003_001E'
    no_hs_vars = [
        'B15003_002E', 'B15003_003E', 'B15003_004E', 'B15003_005E',
        'B15003_006E', 'B15003_007E', 'B15003_008E', 'B15003_009E',
        'B15003_010E', 'B15003_011E', 'B15003_012E', 'B15003_013E',
        'B15003_014E', 'B15003_015E', 'B15003_016E',
    ]

    block_groups = censusdata.censusgeo(
        [('state', '17'), ('county', '031'), ('block group', '*')])
    raw = censusdata.download(
        'acs5', 2015, block_groups,
        [unemployed_base_var, unemployed_count_var, education_base_var]
        + no_hs_vars)

    pct_unemployed = raw[unemployed_count_var] / raw[unemployed_base_var] * 100
    # skipna=False mirrors chained "+": a missing value in any component
    # makes the whole row missing.
    pct_nohs = (raw[no_hs_vars].sum(axis=1, skipna=False)
                / raw[education_base_var] * 100)

    derived_columns = ['percent_unemployed', 'percent_nohs']
    cook_cnty = raw.assign(
        percent_unemployed=pct_unemployed,
        percent_nohs=pct_nohs,
    )[derived_columns]

    print(cook_cnty.describe())

    # Show the 30 block groups in Cook with the highest unemployment rate,
    # alongside their percent with no high-school degree.
    ranked = cook_cnty.sort_values('percent_unemployed', ascending=False)
    print(ranked.head(30))

    # Show the correlation between the two rates.
    print(cook_cnty.corr())

    censusdata.exportcsv('cook_data.csv', cook_cnty)
def test_search(self):
    """Smoke-test censusdata.search with string and callable criteria.

    No return values are checked; the test passes when none of the calls
    raises.
    """
    def mentions_sample_and_housing(value):
        # Truthy only when both phrases occur in the value, ignoring case.
        return (re.search('unweighted sample', value, re.IGNORECASE)
                and re.search('housing', value, re.IGNORECASE))

    # (source, year, field, criterion) in the order the searches are run.
    queries = (
        ('acs5', 2015, 'concept', 'unweighted sample'),
        ('acs5', 2018, 'concept', 'SEX BY AGE'),
        ('acs5', 2015, 'concept', mentions_sample_and_housing),
        ('sf1', 2010, 'concept', 'JUVENILE FACILITIES'),
        ('acsse', 2019, 'concept', 'SEX BY AGE'),
    )
    for src, year, field, criterion in queries:
        censusdata.search(src, year, field, criterion)
def GetFieldList(table, year):
    """Return "<variable> <third search field>" strings for one table's 'E' fields.

    Args:
        table: Table ID; converted to an upper-case string before use.
        year: ACS year; passed through ``alt_search_year`` (defined
            elsewhere) before searching.

    Returns:
        list of str: one entry per search hit whose variable name has
        ``table`` before the first underscore and ends in ``'E'``. Hits from
        the 'concept' search come first, then hits from the 'group' search.
    """
    search_year = alt_search_year(year)
    table_id = str(table).upper()

    field_list = []
    for search_field in ('concept', 'group'):
        for hit in cd.search('acs5', search_year, search_field, table_id):
            variable = hit[0]
            # Skip variables that belong to a different table.
            if variable.partition("_")[0] != table_id:
                continue
            # Keep only names ending in 'E' (presumably estimates - confirm).
            if variable[-1] != 'E':
                continue
            field_list.append("{0} {1}".format(variable, hit[2]))
    return field_list
def GetFieldList(table, year):
    """
    Returns a list of all fields for a particular table ID.

    Args:
        table (str): Table ID (upper-cased before searching)
        year (int): ACS year (mapped through ``alt_search_year`` first)

    Returns:
        list of str: "<variable name> <third element of the search hit>" for
        every hit whose variable name starts with the table ID (up to the
        first underscore) and ends in 'E'.
    """
    year = alt_search_year(year)
    table = str(table).upper()

    def belongs_to_table(name):
        # Same table prefix and an 'E' suffix.
        return name.partition("_")[0] == table and name[-1] == 'E'

    # Search by concept first, then by group, keeping that order in the result.
    by_concept = cd.search('acs5', year, 'concept', table)
    by_group = cd.search('acs5', year, 'group', table)
    hits = by_concept + by_group

    return [
        "{0} {1}".format(hit[0], hit[2])
        for hit in hits
        if belongs_to_table(hit[0])
    ]
def main(verbose=False, data_dir='../data/'):
    """Download 2015 ACS race shares for Illinois tracts and the tract GeoJSON.

    Downloads table B02001 (estimates only) for every census tract in
    state 17, converts the race counts to shares of the tract total, keeps
    the rows whose county name is 'Cook', and writes them to
    ``Illinois2015CensusTractsDemographics.csv`` inside ``data_dir``. It then
    fetches the Illinois tract GeoJSON into ``Illinois2015CensusTracts.json``
    in the same directory via ``download_file``.

    Args:
        verbose (bool): when true, also print the variable search, the table
            layout, the variable list, a small random sample and summary
            statistics.
        data_dir (str): output directory. It is combined with the file names
            using ``os.path.join``, so a trailing separator is optional.
    """
    # Local import so the block does not depend on a module-level ``import os``.
    import os

    table = censusdata.censustable('acs5', 2015, 'B02001')

    if verbose:
        pd.set_option('display.expand_frame_repr', False)
        pd.set_option('display.precision', 2)
        print("Available race variables:")
        print(censusdata.search('acs5', 2015, 'label', 'race'))
        print("Table to download:")
        censusdata.printtable(table)

    # Drop the margin-of-error variables (names ending in 'M').
    variables = [name for name in table if not name.endswith('M')]
    if verbose:
        print("Variables:")
        print(variables)

    illinois_demo = censusdata.download(
        'acs5', 2015,
        censusdata.censusgeo([('state', '17'), ('tract', '*')]),
        variables)

    illinois_demo = illinois_demo.rename(columns={
        'B02001_001E': 'total',
        'B02001_002E': 'white',
        'B02001_003E': 'black',
        'B02001_004E': 'native',
        'B02001_005E': 'asian',
        'B02001_006E': 'pacific',
        'B02001_007E': 'other',
        'B02001_008E': 'two_or_more',
        'B02001_009E': 'two_or_more_including_other',
        'B02001_010E': 'two_or_more_excluding_other',
    })

    # Fold both "two or more" sub-categories into 'other'.
    illinois_demo['other'] = (
        illinois_demo['other']
        + illinois_demo['two_or_more_including_other']
        + illinois_demo['two_or_more_excluding_other'])

    race_columns = ['white', 'black', 'native', 'asian', 'pacific', 'other']
    # .copy() gives an independent frame, so the assignments below do not
    # trigger pandas' SettingWithCopyWarning.
    illinois_demo = illinois_demo[['total'] + race_columns].copy()

    # Convert counts to shares of the tract total.
    illinois_demo[race_columns] = illinois_demo[race_columns].div(
        illinois_demo['total'], axis=0)

    # The index holds the censusgeo objects; pull the identifiers out of them
    # and replace the index with a plain positional one.
    geos = list(illinois_demo.index)
    illinois_demo = illinois_demo.reset_index(drop=True)
    illinois_demo['tract'] = [str(geo.geo[2][1]) for geo in geos]
    illinois_demo['county'] = [geo.geo[1][1] for geo in geos]
    # Second comma-separated part of the name, minus its leading space and
    # its last 7 characters (presumably the " County" suffix - confirm).
    illinois_demo['county_name'] = [
        geo.name.split(',')[1][1:-7] for geo in geos]

    if verbose:
        # Up to 10 random rows; also safe for frames with fewer than 10 rows.
        print(illinois_demo.sample(n=min(10, len(illinois_demo))))
        print(illinois_demo.describe())

    illinois_demo = illinois_demo.loc[illinois_demo.county_name == 'Cook']
    illinois_demo.to_csv(
        os.path.join(data_dir, 'Illinois2015CensusTractsDemographics.csv'))
    print("Successfully downloaded Illinois demographic data.")

    url = "https://github.com/uscensusbureau/citysdk/raw/master/v2/GeoJSON/500k/2015/17/tract.json"
    fname = 'Illinois2015CensusTracts.json'
    target = os.path.join(data_dir, fname)
    download_file(url, target)
    print("Successfully downloaded Illinois census tract shapefile.")
output = 'data/cleaned/01_Demographic'

# Location of the table-shell workbook in which the variables to use are flagged.
input_drive = 'data/raw'
table_shell = os.path.join(input_drive, 'ACS2017_Table_Shells.xlsx')

# Parse the first sheet inside a context manager so the workbook's file
# handle is closed as soon as the data has been read.
with pd.ExcelFile(table_shell) as xl:
    table_shell_df = xl.parse(xl.sheet_names[0])

# Variables flagged for use (Use == 1); also saved next to the raw input.
use_vars = table_shell_df[table_shell_df.Use == 1]
print(use_vars[['TableID', 'Stub', 'Use']])
use_vars.to_csv(os.path.join(input_drive, 'ACS_variables.csv'))
variables = use_vars.TableID.tolist()

# Use the census data package.
# Examples of functionality (the return values are not stored):
censusdata.search('acs5', 2017, 'label', 'unemploy')
# censusdata.search('acs5', 2017, 'concept', 'education')
censusdata.printtable(censusdata.censustable('acs5', 2017, 'B23025'))
censusdata.geographies(censusdata.censusgeo([('state', '*')]), 'acs5', 2017)
censusdata.geographies(
    censusdata.censusgeo([('state', '08'), ('county', '*')]), 'acs5', 2017)

# The C variables do not seem to work, so remove them, along with B17002.
# NOTE(review): this drops every table ID containing the letter 'C'
# anywhere, not only IDs that start with it - confirm that is intended.
variables = [
    var for var in variables
    if 'C' not in var and "B17002" not in var
]

# Loop through all variables and merge the data together.
count = 0
for variable in variables:
    print(variable)
    data = censusdata_pull(variable)
county_id
## dictionary with name as key and census geo object
## list(county_id.keys())[0:10]

# austin_counties = ['021', '055', '209', '453', '491']
austin_counties = ['453']

## Get all the census tracts from the counties
## state > county > tract
austin_tracts_names = []
# The loop variable is deliberately NOT called county_id: reusing that name
# would overwrite the county_id dictionary referenced above.
for county_fips in austin_counties:
    county_tracts = censusdata.geographies(
        censusdata.censusgeo(
            [('state', '48'), ('county', county_fips), ('tract', '*')]),
        'acs5', 2016)
    austin_tracts_names.extend(county_tracts.keys())

## Search the profile tables for all variables whose label matches 'white'
## (the result is not stored).
censusdata.search('acs5', 2016, 'label', 'white', 'profile')

# Download DP05_0032PE for the MSA with code 12420 (intended as the % white
# figure; the goal is % white for every census tract within the MSA).
austin_msa = censusdata.download(
    'acs5', 2016,
    censusdata.censusgeo(
        [('metropolitan statistical area/micropolitan statistical area',
          '12420')]),
    ['DP05_0032PE'])
## I think the call above returns the right % white population; it turns out
## there are a lot of white people.
## Think about: how does construction of hispanic/latino as a race shape this?
## ACS: "The data on race are based on self-identification and the categories on the form generally reflect a social definition of race"
## AH: Yes! Here's a good paper on this -- https://www.annualreviews.org/doi/abs/10.1146/annurev.soc.29.010202.100006
## austin_msa = censusdata.download('acs5', 2016,
# censusdata.censusgeo([('state', '48')]),
import os
import argparse

# Command-line interface: three independent switches that may be combined.
parser = argparse.ArgumentParser()
parser.add_argument('--search', action='store_true',
                    help="To perform a search for variables")
parser.add_argument('--get', action='store_true',
                    help="To load variables data from the CENSUS data and store to csv")
parser.add_argument('--store', action='store_true',
                    help="To load data from csv into the database")
args = parser.parse_args()

if args.search:
    # Search the CENSUS data for variables whose label matches 'geoid'.
    # (Named matching_vars rather than vars so the builtin vars() is not
    # shadowed.)
    matching_vars = censusdata.search('acs5', 2018, 'label', 'geoid',
                                      tabletype='detail')
    print(f"Found {len(matching_vars)} matching variables.")
    # Write every retrieved variable to a file, one per line.
    with open("search_results.txt", "w") as f:
        f.writelines(str(v) + "\n" for v in matching_vars)

if args.get:
    # Download the data from the CENSUS ...
    df = download_data('useful_variables.txt')
    # ... and save the retrieved data to a csv.
    df.to_csv('data.csv', index=False)

if args.store:
    table_name = "amritsin_hw1"
    schema_name = "acs"
import censusdata

# Search the ACS 2015 5-year variables whose label matches 'unemploy', then
# keep only entries 160-169 of the result and print them.
unemploy_matches = censusdata.search('acs5', 2015, 'label', 'unemploy')
x = unemploy_matches[160:170]
print(x)
#Creating dictionary mapping for census tract to zipcode #to be able to map the census data to education data census_zipcode_relation_filename = 'zcta_tract.csv' census_zipcode_relation = pd.read_csv(census_zipcode_relation_filename, \ delimiter=',', dtype=str) ny_tract_to_zipcode = census_zipcode_relation[census_zipcode_relation\ ['STATE'] == '36'][['TRACT', 'ZCTA5']] tract_zipcode = {} for row in ny_tract_to_zipcode.itertuples(): tract_zipcode[row[1]] = row[2] #Obtaining 2010 Census median income data for each borough in NYC censusdata.search('acs5', 2015, 'label', 'median income') censusdata.censustable('acs5', 2015, 'B06011') median_income_bronx = censusdata.download('acs5', 2015, \ censusdata.censusgeo([('state', '36'), ('county', '005'), \ ('tract', '*')]), ['B06011_001E']) median_income_kings = censusdata.download('acs5', 2015, \ censusdata.censusgeo([('state', '36'), ('county', '047'), \ ('tract', '*')]), ['B06011_001E']) median_income_ny = censusdata.download('acs5', 2015, \ censusdata.censusgeo([('state', '36'), ('county', '061'), \ ('tract', '*')]), ['B06011_001E']) median_income_queens = censusdata.download('acs5', 2015, \ censusdata.censusgeo([('state', '36'), ('county', '081'), \ ('tract', '*')]), ['B06011_001E']) median_income_richmond = censusdata.download('acs5', 2015, \ censusdata.censusgeo([('state', '36'), ('county', '085'), \
# # ) # # # print( getattr(censusdata.download,'name') ) # c = censusdata.download('acs5', 2015, censusdata.censusgeo([('county', 'Kings County')]), # ['B01001_001E', 'B01001_020E']) # print(c.describe()) ############################################### import pandas as pd import censusdata pd.set_option('display.expand_frame_repr', False) pd.set_option('display.precision', 2) import statsmodels.formula.api as sm #Adding all the age columns ageTableCols = censusdata.search('acs1', 2018, 'concept', 'age') tableColDict = dict() tableColCodeArray = [] for i in range(0, len(ageTableCols)): tableColCode = str( ageTableCols[i]).split(", ")[0].strip("(").strip(")").strip("'") tableColSection = str( ageTableCols[i]).split(", ")[1].strip("(").strip(")").strip("'") tableColVar = str( ageTableCols[i]).split(", ")[2].strip("(").strip(")").strip("'") tableColDict.update({tableColCode: [tableColSection, tableColVar]}) tableColCodeArray.append(tableColCode) a = [ 'B01001A_001E', 'B01]1A_002E', 'B01001A_003E', 'B01001A_004E', 'B01001A_005E'