Ejemplo n.º 1
0
def download_hospital_data(url, write_out_csv=True):
    """Download the hospital dataset from *url*, attach county FIPS codes,
    sanitize the BEDS column, and optionally persist the result to a
    date-stamped CSV file.
    """
    # TODO: Move this processing code probably into data processing class
    frame = pd.read_csv(url)

    # TODO: Leverage geocoder class instead of this code
    fips_lookup = addfips.AddFIPS()

    # Reverse geocoder used to get geocoded fips and county information
    frame['fips'] = frame.apply(
        lambda row: fips_lookup.get_county_fips(row['COUNTY'], row['STATE']),
        axis=1)

    # Negative bed counts are data errors; clamp them to zero.
    frame['BEDS'] = frame.apply(
        lambda row: remove_negative_values_replace_with_zero(row['BEDS']),
        axis=1)

    # write out this data file to csv
    if write_out_csv:
        stamp = time.strftime("%Y%m%d")
        frame.to_csv('hospital_data_processed_' + stamp + '.csv',
                     index=False, header=True)

    return frame
Ejemplo n.º 2
0
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """Add State and County FIPS IDs to a dataframe."""
    # Cast both columns to the nullable string dtype so that missing values
    # are uniformly pd.NA before they reach addfips.
    df = df.astype({state_col: pd.StringDtype(), county_col: pd.StringDtype()})
    fips_coder = addfips.AddFIPS(vintage=vintage)

    def _state_fips(row):
        # Skip the lookup for missing states; propagate pd.NA instead.
        if pd.notnull(row[state_col]):
            return fips_coder.get_state_fips(state=row[state_col])
        return pd.NA

    def _county_fips(row):
        # County lookups need both a state and a county name.
        if pd.notnull(row[county_col]):
            return fips_coder.get_county_fips(state=row[state_col],
                                              county=row[county_col])
        return pd.NA

    # Lookup the state and county FIPS IDs and add them to the dataframe:
    df["state_id_fips"] = df.apply(_state_fips, axis=1)
    logger.info(
        f"Assigned state FIPS codes for "
        f"{len(df[df.state_id_fips.notnull()])/len(df):.2%} of records.")
    df["county_id_fips"] = df.apply(_county_fips, axis=1)
    # Keep the code columns as nullable strings -- the leading zeros
    # in FIPS codes are significant.
    df = df.astype({
        "county_id_fips": pd.StringDtype(),
        "state_id_fips": pd.StringDtype(),
    })
    logger.info(
        f"Assigned county FIPS codes for "
        f"{len(df[df.county_id_fips.notnull()])/len(df):.2%} of records.")
    return df
Ejemplo n.º 3
0
    def handle(self, *args, **options):
        """Read the USGS stations CSV and insert a node record for every
        station that is not already in the database.
        """
        base_dir = settings.BASE_DIR
        fips_coder = addfips.AddFIPS()

        stations_path = '%s/utils/crawlers/usgs_station/stations.csv' % (base_dir)
        with open(stations_path) as csv_file:
            for line_no, row in enumerate(csv.reader(csv_file, delimiter=',')):
                # Skip the header line.
                if line_no == 0:
                    continue

                (site_no, url, lat, lon, _id, county, state,
                 hydrolic_unit_id, id2) = row
                state_fips = fips_coder.get_state_fips(state)
                county_fips = fips_coder.get_county_fips(county, state=state)

                node = get_object_or_None(models.node, name=site_no)
                if node:
                    log('node %s - exists' % (site_no), 'info')
                else:
                    node = models.node()
                    node.name = site_no
                    # NOTE(review): the original passes (lat, lon); confirm
                    # the axis order this Point type actually expects.
                    node.position = Point(float(lat), float(lon))
                    node.county = county
                    node.state = getStateAbbrev(state)
                    node.fips_state = state_fips
                    node.fips_county = county_fips
                    node.meta = json.dumps({'site_no': site_no,
                                            'hydrolic_unit_id': hydrolic_unit_id,
                                            'id2': id2})
                    node.save()
                    log('node %s - insert' % (site_no), 'success')
Ejemplo n.º 4
0
def get_fips(row):
    '''This function gets a row from a dataframe (a series) and returns the FIPS code from the county.'''
    coder = addfips.AddFIPS()
    # addfips enriches a dict in place-style: it returns the row with a
    # 'fips' entry added from the BUYER_COUNTY / BUYER_STATE fields.
    enriched = coder.add_county_fips(series_to_dict(row),
                                     county_field="BUYER_COUNTY",
                                     state_field="BUYER_STATE")
    return enriched['fips']
Ejemplo n.º 5
0
def add_fips_ids(df, state_col="state", county_col="county", vintage=2015):
    """Add State and County FIPS IDs to a dataframe.

    Args:
        df: dataframe containing state and county name columns.
        state_col: name of the column holding state names.
        county_col: name of the column holding county names.
        vintage: FIPS vintage year handed to addfips.AddFIPS.

    Returns:
        The dataframe with new "state_id_fips" and "county_id_fips"
        columns; rows with a missing state/county get pd.NA.
    """
    af = addfips.AddFIPS(vintage=vintage)
    # Lookup the state and county FIPS IDs and add them to the dataframe.
    # BUG FIX: the original ignored state_col/county_col and always read the
    # hard-coded "state"/"county" attributes; it also passed missing values
    # straight into addfips. Guard nulls and honor the parameters.
    df["state_id_fips"] = df.apply(
        lambda x: (af.get_state_fips(state=x[state_col])
                   if pd.notnull(x[state_col]) else pd.NA),
        axis=1)
    logger.info(
        f"Assigned state FIPS codes for "
        f"{len(df[df.state_id_fips.notnull()])/len(df):.2%} of records.")
    df["county_id_fips"] = df.apply(
        lambda x: (af.get_county_fips(state=x[state_col], county=x[county_col])
                   if pd.notnull(x[county_col]) else pd.NA),
        axis=1)
    # NOTE: the original's `fillna(pd.NA)` was a no-op and has been removed;
    # missing lookups already surface as pd.NA above.
    logger.info(
        f"Assigned county FIPS codes for "
        f"{len(df[df.county_id_fips.notnull()])/len(df):.2%} of records.")
    return df
Ejemplo n.º 6
0
def getFIPS_General(
        city_tuple,
        values):  # not used as the addfips lib can not recognize some cities
    '''
    This function is to get the fips code from the city name
    : param city_tuple: tuple
    '''
    assert isinstance(values, list)
    coder = addfips.AddFIPS()
    # city_tuple entries are (state, county); one lookup per entry in values.
    codes = [coder.get_county_fips(city_tuple[idx][1],
                                   state=city_tuple[idx][0])
             for idx in range(len(values))]
    codes, values = getSorted(codes, values)
    return codes
def get_county_fips_code(fname, state_col, county_col):
    '''
    A function that uses the addfips library to assign fips codes accordingly to our data.
    FIPS codes help with geographic plots for our dataset, as they are unified numbers for
    states and counties.
    :fname: string. Name of input file to get FIPS codes from
    :state_col: column number where state name is located
    :county_col: column number where county name is stored.
    '''
    assert isinstance(fname, str) and isinstance(state_col, int) and isinstance(county_col, int)
    assert state_col >= 0 and county_col >= 0
    coder = addfips.AddFIPS()
    # Pull the state and county columns out of the input file separately.
    _, states = import_data(fname, read='col', samples=None, col_num=state_col)
    _, counties = import_data(fname, read='col', samples=None, col_num=county_col)
    assert len(states) == len(counties)
    # Pair each county with its state and look the code up row by row.
    return [coder.get_county_fips(cty, st) for cty, st in zip(counties, states)]
Ejemplo n.º 8
0
def clean(data):
    """Normalize raw election-model records: rename headers, parse dates,
    attach state FIPS codes, encode the model type, and order the output.
    """
    df = pd.DataFrame(data)

    # Rename the file headers
    df.rename(variables, axis="columns", inplace=True)

    # Reformat dates
    df['date_modeled'] = pd.to_datetime(df['date_modeled'])

    # Turn state names into fips code
    af = addfips.AddFIPS()
    df['us_state_fips'] = [af.get_state_fips(name) for name in df['state']]

    # Drop records where state fips is empty
    df = df.dropna(subset=['us_state_fips'])

    # Code model type
    df['model'] = df['model'].map({'polls-only': '0', 'polls-plus': '1'})

    # reorder so that the cnt and new  are always next to each other in the same order
    df = df[[
        'election_cycle', 'race_type', 'name_incumbent', 'name_challenger',
        'name_third', 'us_state_fips', 'tipping', 'vpi', 'win_state_incumbent',
        'win_state_challenger', 'win_state_third',
        'win_ec_if_win_state_incumbent', 'win_ec_if_win_state_challenger',
        'win_state_if_win_ec_incumbent', 'win_state_if_win_ec_challenger',
        'voteshare_incumbent', 'voteshare_challenger', 'voteshare_third',
        'voteshare_other', 'voteshare_lo_incumbent', 'voteshare_lo_challenger',
        'voteshare_lo_third', 'voteshare_lo_other', 'voteshare_hi_incumbent',
        'voteshare_hi_challenger', 'voteshare_hi_third', 'voteshare_hi_other',
        'margin', 'margin_lo', 'margin_hi', 'model', 'date_modeled',
        'simulation_timestamp', 'simulation_count'
    ]]

    # order the records by date, newest first
    return df.sort_values(by=['date_modeled'], ascending=False)
Ejemplo n.º 9
0
def county_data_for_population_score(st, yr):
    """
    Returns a dataframe containing the 8 pop. char.
    indicators for each county in the defined state.

    st = abbrevation for state of interest (TX, FL, ...) - string
    yr = end year of interst for the ACS 5-year estimates - integer
    """
    # acs requests
    host, dataset = 'https://api.census.gov/data', 'acs/acs5'
    year = str(yr)  # convert year to string
    # pop. char. indicators - refer to documentation for more info
    variables = [
        "B03002_001E", "B03002_003E", "B17001_001E", "B17001_002E",
        "B16010_001E", "B16010_002E", "B01001_001E", "B01001_003E",
        "B01001_004E", "B01001_027E", "B01001_028E", "B01001_020E",
        "B01001_021E", "B01001_022E", "B01001_023E", "B01001_024E",
        "B01001_025E", "B01001_044E", "B01001_045E", "B01001_046E",
        "B01001_047E", "B01001_048E", "B01001_049E", "B25070_001E",
        "B25070_007E", "B25070_008E", "B25070_009E", "B25070_010E",
        "B25070_011E"
    ]
    # variable names for the pop. char. indicators
    col_names = [
        "name", "tpop_race", "tpop_nhwhite", "tpop_poverty",
        "tpop_impoverished", "tpop_education", "tpop_lthsgraduate", "tpop_age",
        "tpop_lt5M", "tpop_5to9M", "tpop_lt5F", "tpop_5to9F", "tpop_65to66M",
        "tpop_67to69M", "tpop70to74M", "tpop_75to79M", "tpop_80to84M",
        "tpop_over85M", "tpop_65to66F", "tpop_67to69F", "tpop70to74F",
        "tpop_75to79F", "tpop_80to84F", "tpop_over85F", "tpop_incomeonhousing",
        "tpop_30to34.9IonH", "tpop_35to39.9IonH", "tpop_40to49.9IonH",
        "tpop_over50IonH", "tpop_NAIonH", "state", "county"
    ]
    # BUG FIX: the original looped over `variables` and issued the *same*
    # request (asking for every variable at once) once per variable, keeping
    # only the final response. A single request retrieves everything.
    print("Fetching data for {} variables".format(len(variables)))
    url = "/".join([host, year, dataset])
    predicates = {
        "get": ",".join(["NAME"] + variables),
        "for": "county:*",
        "in": "state:" + addfips.AddFIPS().get_state_fips(st),
    }
    r = requests.get(url, params=predicates)
    # First row of the ACS response is the header; skip it.
    df = pd.DataFrame(columns=col_names, data=r.json()[1:])
    # create column of census tract names
    name_information = df.name
    # modify the dataframe to remove the name column and format values as integers
    df = df[df.columns[~df.columns.isin(['name'])]].astype(int)
    # create dataframe from the retrieved data
    indicators_df = pd.DataFrame()
    # do the calculations for each indicator and store it in the newly made dataframe
    indicators_df = indicators_df.assign(
        name=name_information.apply(lambda x: x.split(',')[0]),
        population=df.tpop_race,
        nwpopulation=df.tpop_nhwhite,
        nwpopulation_p=100 * (df.tpop_nhwhite / df.tpop_race),
        poverty=df.tpop_impoverished,
        poverty_p=100 * (df.tpop_impoverished / df.tpop_poverty),
        educational_attainment_p=100 *
        (df.tpop_lthsgraduate / df.tpop_education),
        age_0to9=df.tpop_lt5M + df.tpop_5to9M + df.tpop_lt5F + df.tpop_5to9F,
        age_0to9_p=100 *
        ((df.tpop_lt5M + df.tpop_5to9M + df.tpop_lt5F + df.tpop_5to9F) /
         df.tpop_age),
        age_10to64=df.tpop_age -
        (df.tpop_lt5M + df.tpop_5to9M + df.tpop_lt5F + df.tpop_5to9F) -
        (df.tpop_65to66M + df.tpop_67to69M + df.tpop70to74M + df.tpop_75to79M +
         df.tpop_80to84M + df.tpop_over85M + df.tpop_65to66F +
         df.tpop_67to69F + df.tpop70to74F + df.tpop_75to79F + df.tpop_80to84F +
         df.tpop_over85F),
        age_10to64_p=100 *
        ((df.tpop_age -
          (df.tpop_lt5M + df.tpop_5to9M + df.tpop_lt5F + df.tpop_5to9F) -
          (df.tpop_65to66M + df.tpop_67to69M + df.tpop70to74M +
           df.tpop_75to79M + df.tpop_80to84M + df.tpop_over85M +
           df.tpop_65to66F + df.tpop_67to69F + df.tpop70to74F +
           df.tpop_75to79F + df.tpop_80to84F + df.tpop_over85F)) /
         (df.tpop_age)),
        age_65=df.tpop_65to66M + df.tpop_67to69M + df.tpop70to74M +
        df.tpop_75to79M + df.tpop_80to84M + df.tpop_over85M + df.tpop_65to66F +
        df.tpop_67to69F + df.tpop70to74F + df.tpop_75to79F + df.tpop_80to84F +
        df.tpop_over85F,
        age_65_p=100 *
        ((df.tpop_65to66M + df.tpop_67to69M + df.tpop70to74M +
          df.tpop_75to79M + df.tpop_80to84M + df.tpop_over85M +
          df.tpop_65to66F + df.tpop_67to69F + df.tpop70to74F +
          df.tpop_75to79F + df.tpop_80to84F + df.tpop_over85F) / df.tpop_age),
        housing_burden_p=100 *
        ((df["tpop_30to34.9IonH"] + df["tpop_35to39.9IonH"] +
          df["tpop_40to49.9IonH"] + df["tpop_over50IonH"]) /
         (df.tpop_incomeonhousing - df.tpop_NAIonH)),
        county=df.county)
    # add health data to the indicators dataframe
    # this is for TX only. from the TX DSHS
    if st == 'TX':
        TX_health_indicators_df = pd.read_csv(
            './raw_data/TX_health_indicators.csv').rename(
                index=str, columns={'county': 'name'})

        indicators_df = indicators_df.merge(TX_health_indicators_df[[
            'name', 'low_birth_weight', 'cardiovascular_disease'
        ]],
                                            how='left',
                                            on="name")
        # BUG FIX: the original deleted TX_health_indicators_df
        # unconditionally, which raised NameError for non-TX states.
        del TX_health_indicators_df

    del df

    # save as dataframe and/or CSV file
    if QA:
        indicators_df.to_csv('county_population_df.csv', index=False)
    return indicators_df
Ejemplo n.º 10
0
def add_geodata_to_dataframe(ut, st, ogdf):
    """
    Returns modified dataframe with geodata added

    ut = unit of data (tract or county) - string
    st = abbreviation for state of interest - string
    ogdf = dataframe that will be modified - dataframe

    Raises ValueError when ut is not "tract" or "county".
    """
    # filter shapefile by state and add it to the dataframe
    # for tract level
    if ut == "tract":
        print("Fetching geographic {} data".format(ut))
        # read in shp file
        if st == 'TX':
            # BUG FIX: this state-specific shapefile is read without a
            # STATEFP/'state' column, so the original's unconditional state
            # filter raised KeyError for TX; the file is presumed to already
            # be Texas-only (TODO confirm), so no filter is applied here.
            tempdf = gpd.read_file(
                "./boundary_data/tl_2018_48_tract/tl_2018_48_tract.shp")[[
                    'GEOID', 'geometry'
                ]]
        else:
            tempdf = gpd.read_file("./boundary_data/usatracts/usa_tracts.shp")[
                ['STATEFP', 'GEOID', 'geometry']]
            tempdf = tempdf.rename(index=str, columns={'STATEFP': 'state'})
            # filter shp file based on state
            tempdf = tempdf[
                tempdf['state'] == addfips.AddFIPS().get_state_fips(st)]
        # BUG FIX: the merge key is 'geoid' but the shapefile column is
        # 'GEOID'; normalize the name so the merge can find the key
        # (assumes ogdf carries a lowercase 'geoid' column -- TODO confirm).
        tempdf = tempdf.rename(index=str, columns={'GEOID': 'geoid'})
        # add geodata
        print("Adding geographic {} data to the dataframe".format(ut))
        geo_dataframe = tempdf.merge(ogdf, how='left', on='geoid')
    # for county level
    elif ut == "county":
        print("Fetching geographic {} data".format(ut))
        # read in shp file
        tempdf = gpd.read_file(
            "./boundary_data/tl_2019_us_county/tl_2019_us_county.shp")[[
                'STATEFP', 'COUNTYFP', 'GEOID', 'geometry'
            ]]
        tempdf = tempdf.rename(index=str,
                               columns={
                                   'STATEFP': 'state',
                                   'COUNTYFP': 'county'
                               })
        # fiter shp file based on state
        tempdf = tempdf[tempdf['state'] == addfips.AddFIPS().get_state_fips(
            st)]
        tempdf['county'] = tempdf['county'].astype(int)
        # add geodata
        print("Adding geographic {} data to the dataframe".format(ut))
        geo_dataframe = tempdf.merge(ogdf, how='left', on='county')
    # unit is not clearly defined or unavailable
    else:
        # BUG FIX: the original only printed a message and then crashed with
        # NameError on the unbound geo_dataframe/tempdf; fail loudly instead.
        raise ValueError(
            "Please check 'ut' input to make sure it is either 'tract' or "
            "'county', not {}".format(ut))

    del tempdf

    # save as dataframe and/or CSV file
    if QA:
        geo_dataframe.to_csv('geocounty_population_df.csv', index=False)
    return geo_dataframe
              'r') as csv_summary_file_yesterday:
        csv_summary_yesterday = csv_summary_file_yesterday.read()
    if filecmp.cmp(csv_summary_file_name_today_tmp,
                   csv_summary_file_name_yesterday):
        print('Website not updated: ', url)
        os.remove(csv_summary_file_name_today_tmp)
        sys.exit(1)
    else:
        print('Begin crawling')

##===process today's csv summary data===
csv_summary_today = csv_summary_today_tmp
# Promote the temporary download to today's canonical summary file.
if os.path.exists(csv_summary_file_name_today_tmp):
    os.rename(csv_summary_file_name_today_tmp, csv_summary_file_name_today)
    print('Save today\'s csv summary data: ', csv_summary_file_name_today)

# Drop the trailing summary row, then coerce the count columns to integers.
df = pd.read_csv(csv_summary_file_name_today)[:-1]
for count_col in ['Case Count', 'Death Count', 'Hospitalized Count']:
    df[count_col] = df[count_col].astype('int64')

# Collapse to one row per county and attach Ohio county FIPS codes.
df2 = (df.groupby('County', as_index=True)
         [['Case Count', 'Death Count', 'Hospitalized Count']]
       .sum()
       .reset_index())
af = addfips.AddFIPS()
df2['fips'] = [af.get_county_fips(name, 'OH') for name in df2['County']]

df2.to_csv(csv_county_summary_file_name_today, index=None, header=True)
print('Save today\'s csv county summary data: ',
      csv_county_summary_file_name_today)
Ejemplo n.º 12
0
from . import settings

import addfips
from dateutil import parser as dateutil_parser
import hashlib
import os
import pendulum
import requests
import threading
import us

_fips = addfips.AddFIPS()
_fip_state_reverse = {}
_fip_state_full_reverse = {}
_fip_county_reverse = {}
# Patches to the 'addfips' library, for counties
_fip_patches = {
    # Added everywhere.
    '_global': {
        'all': '000',
        'unknown': '999',
    },
    'AK': {
        # There are technically two codes; this is used for ambiguous
        # entries.
        'aleutian islands': '010',
        # These show up in the USDA census data.  They're to be trusted.
        'anchorage borough': '020',
        'juneau borough': '110',
        'kuskokwim division': '160',
        'prince of wales-outer ketchikan census area': '201',
Ejemplo n.º 13
0
 def __init__(self, county_fips_download_url):
     """Create an addfips coder and immediately fetch the county FIPS data.

     county_fips_download_url: URL handed straight to
         download_county_fips_info (defined elsewhere on this class).
     """
     # self.af must exist before download_county_fips_info runs, since that
     # helper presumably relies on it -- TODO confirm.
     self.af = addfips.AddFIPS()
     self.download_county_fips_info(county_fips_download_url)
Ejemplo n.º 14
0
 def get_fips_code(county, state_abr):
     """Return the county FIPS code for *county* in the state *state_abr*."""
     # One-shot lookup; AddFIPS holds no state worth caching here.
     return addfips.AddFIPS().get_county_fips(county, state=state_abr)