def Census_pop_URL_helper(build_url, config, args): urls = [] # get date code for july 1 population numbers for k, v in config['datecodes'].items(): if str(args['year']) == str(k): dc = str(v) # the url for 2010 and earlier is different url2000 = 'https://api.census.gov/data/2000/pep/int_population?get=POP,DATE_DESC&for=__aggLevel__:*&DATE_=12&key=__apiKey__' # state fips required for county level 13-14 FIPS_2 = get_all_state_FIPS_2()['FIPS_2'] # drop puerto rico FIPS_2 = FIPS_2[FIPS_2 != '72'] for c in config['agg_levels']: # this timeframe requires state fips at the county level in the url if '2010' < args['year'] < '2015' and c == 'county': for b in FIPS_2: url = build_url url = url.replace("__aggLevel__", c) url = url.replace("__DateCode__", dc) url = url.replace("population?&", 'cty?') url = url.replace("DATE_CODE", 'DATE_') url = url.replace("&key", "&in=state:" + b + "&key") urls.append(url) else: if args['year'] > '2010': url = build_url url = url.replace("__aggLevel__", c) url = url.replace("__DateCode__", dc) # url date variable different pre 2018 if args['year'] < '2018': url = url.replace("DATE_CODE", 'DATE_') # url for 2011 - 2014 slightly modified if args['year'] < '2015' and c != 'county': url = url.replace("population?&", 'natstprc?') urls.append(url) elif args['year'] == '2010': url = url2000 url = url.replace("__aggLevel__", c) url = url.replace("&in=state:__stateFIPS__", '') if c == "us": url = url.replace("*", "1") userAPIKey = load_api_key( config['api_name']) # (common.py fxn) url = url.replace("__apiKey__", userAPIKey) urls.append(url) return urls
def Census_CBP_URL_helper(build_url, config, args): urls_census = [] FIPS_2 = get_all_state_FIPS_2()['FIPS_2'] for c in FIPS_2: url = build_url url = url.replace("__stateFIPS__", c) # specified NAICS code year depends on year of data if args["year"] in ['2017']: url = url.replace("__NAICS__", "NAICS2017") if args["year"] in ['2012', '2013', '2014', '2015', '2016']: url = url.replace("__NAICS__", "NAICS2012") if args["year"] in ['2010', '2011']: url = url.replace("__NAICS__", "NAICS2007") urls_census.append(url) return urls_census
def BLS_QCEW_URL_helper(build_url, config, args): urls = [] FIPS_2 = get_all_state_FIPS_2()['FIPS_2'] # the url for 2013 earlier is different than the base url (and is a zip file) if args["year"] < '2014': url = build_url url = url.replace('api', 'files') url = url.replace('a/area/__areaFIPS__.csv', 'csv/' + args["year"] + '_annual_by_area.zip') urls.append(url) else: for c in FIPS_2: url = build_url url = url.replace('__areaFIPS__', c + '000') urls.append(url) return urls
def mlu_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # concat dataframes df = pd.concat(df_list, sort=False).drop(columns=['SortOrder', 'Region']).rename( columns={'Region or State': 'State'}) # use "melt" fxn to convert colummns into rows df = df.melt(id_vars=["State", "Year"], var_name="FlowName", value_name="FlowAmount") # load fips codes and merge fips = get_all_state_FIPS_2() fips['State'] = fips['State'].apply(lambda x: x.title()) fips['FIPS_2'] = fips['FIPS_2'] + '000' dfm = df.merge(fips, how='left').rename(columns={'FIPS_2': 'Location'}) dfm['Location'] = np.where(dfm['State'] == "U.S. total", US_FIPS, dfm['Location']) # drop null values dfm2 = dfm[~dfm['Location'].isnull()].reset_index(drop=True) dfm3 = assign_fips_location_system(dfm2, year) # sub by year dfm3['Year'] = dfm3['Year'].astype(str) dfm3 = dfm3[dfm3['Year'] == year].reset_index(drop=True) dfm3["Class"] = "Land" dfm3["SourceName"] = source dfm3["ActivityProducedBy"] = None dfm3["ActivityConsumedBy"] = dfm3['FlowName'] dfm3['FlowType'] = 'ELEMENTARY_FLOW' dfm3["Compartment"] = 'ground' dfm3["Unit"] = "Thousand Acres" dfm3['DataReliability'] = 5 # tmp dfm3['DataCollection'] = 5 # tmp return dfm3
def blm_pls_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function dataframe_list = kwargs['dataframe_list'] args = kwargs['args'] Location = [] fips = get_all_state_FIPS_2() df = pd.concat(dataframe_list, sort=False) df = df.drop(df[df.FlowAmount == ""].index) for index, row in df.iterrows(): if row['LocationStr'] == "Total": Location.append("00000") else: for i, fips_row in fips.iterrows(): if fips_row["State"] == row['LocationStr']: Location.append(fips_row["FIPS_2"] + "000") df = df.drop(columns=["LocationStr"]) # standardize activity names df = standardize_blm_pls_activity_names(df) # replace withdrawn code df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = WITHDRAWN_KEYWORD df.loc[df['FlowAmount'] == "N", 'FlowAmount'] = WITHDRAWN_KEYWORD df['FlowName'] = df['ActivityConsumedBy'].copy() df['Location'] = Location df["Class"] = 'Land' df['Compartment'] = "ground" df["LocationSystem"] = "FIPS_2010" df["SourceName"] = 'BLM_PLS' df['Year'] = args["year"] df['Unit'] = "Acres" df['FlowType'] = 'ELEMENTARY_FLOW' df['DataReliability'] = 5 # tmp df['DataCollection'] = 5 #tmp return df
def mlu_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function dataframe_list = kwargs['dataframe_list'] args = kwargs['args'] output = pd.DataFrame() # concat dataframes df = pd.concat(dataframe_list, sort=False) data = {} df_columns = df.columns.tolist() location = "" fips = get_all_state_FIPS_2() for index, row in df.iterrows(): if int(row["Year"]) == int(args['year']): if (row["Region or State"] != "Northeast") & \ (row["Region or State"] != "Lake States") & \ (row["Region or State"] != "Corn Belt") & \ (row["Region or State"] != "Northern Plains") & \ (row["Region or State"] != "Appalachian") & \ (row["Region or State"] != "Southeast") & \ (row["Region or State"] != "Delta States") & \ (row["Region or State"] != "Southern Plains") & \ (row["Region or State"] != "Mountain") & \ (row["Region or State"] != "Pacific") & \ (row["Region or State"] != "48 States"): if row['Region or State'] == "U.S. total": location = "00000" else: for i, fips_row in fips.iterrows(): if fips_row["State"] == row['Region or State']: location = fips_row["FIPS_2"] + "000" for col in df_columns: if (col != "SortOrder") & (col != "Region") & \ (col != "Region or State") & (col != "Year"): data["Class"] = "Land" data["SourceName"] = "USDA_ERS_MLU" # flownames are the same as ActivityConsumedBy for # purposes of mapping elementary flows data['FlowName'] = col data["FlowAmount"] = int(row[col]) data["ActivityProducedBy"] = None data["ActivityConsumedBy"] = col data['FlowType'] = 'ELEMENTARY_FLOW' data["Compartment"] = 'ground' data["Location"] = location data["Year"] = int(args['year']) data["Unit"] = "Thousand Acres" data['DataReliability'] = 5 # tmp data['DataCollection'] = 5 # tmp output = output.append(data, ignore_index=True) output = assign_fips_location_system(output, args['year']) return output
def Census_pop_URL_helper(**kwargs): """ This helper function uses the "build_url" input from flowbyactivity.py, which is a base url for data imports that requires parts of the url text string to be replaced with info specific to the data year. This function does not parse the data, only modifies the urls from which data is obtained. :param kwargs: potential arguments include: build_url: string, base url config: dictionary, items in FBA method yaml args: dictionary, arguments specified when running flowbyactivity.py flowbyactivity.py ('year' and 'source') :return: list, urls to call, concat, parse, format into Flow-By-Activity format """ # load the arguments necessary for function build_url = kwargs['build_url'] config = kwargs['config'] args = kwargs['args'] urls = [] # get date code for july 1 population numbers for k, v in config['datecodes'].items(): if str(args['year']) == str(k): dc = str(v) # the url for 2010 and earlier is different url2000 = 'https://api.census.gov/data/2000/pep/int_population?get=' \ 'POP,DATE_DESC&for=__aggLevel__:*&DATE_=12&key=__apiKey__' # state fips required for county level 13-14 FIPS_2 = get_all_state_FIPS_2()['FIPS_2'] # drop puerto rico FIPS_2 = FIPS_2[FIPS_2 != '72'] for c in config['agg_levels']: # this timeframe requires state fips at the county level in the url if '2010' < args['year'] < '2015' and c == 'county': for b in FIPS_2: url = build_url url = url.replace("__aggLevel__", c) url = url.replace("__DateCode__", dc) url = url.replace("population?&", 'cty?') url = url.replace("DATE_CODE", 'DATE_') url = url.replace("&key", "&in=state:" + b + "&key") urls.append(url) else: if args['year'] > '2010': url = build_url url = url.replace("__aggLevel__", c) url = url.replace("__DateCode__", dc) # url date variable different pre 2018 if args['year'] < '2018': url = url.replace("DATE_CODE", 'DATE_') # url for 2011 - 2014 slightly modified if args['year'] < '2015' and c != 'county': url = url.replace("population?&", 'natstprc?') urls.append(url) elif args['year'] == '2010': url = url2000 url = url.replace("__aggLevel__", c) url = url.replace("&in=state:__stateFIPS__", '') if c == "us": url = url.replace("*", "1") userAPIKey = load_api_key(config['api_name']) # (common.py fxn) url = url.replace("__apiKey__", userAPIKey) urls.append(url) return urls
def fiws_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function dataframe_list = kwargs['dataframe_list'] args = kwargs['args'] # concat dataframes df = pd.concat(dataframe_list, sort=False) # select data for chosen year, cast year as string to match argument df['Year'] = df['Year'].astype(str) df = df[df['Year'] == args['year']].reset_index(drop=True) # add state fips codes, reading in datasets from common.py fips = get_all_state_FIPS_2().reset_index(drop=True) # ensure capitalization of state names fips['State'] = fips['State'].apply(lambda x: x.title()) fips['StateAbbrev'] = fips['State'].map(us_state_abbrev) # pad zeroes fips['FIPS_2'] = fips['FIPS_2'].apply(lambda x: x.ljust(3 + len(x), '0')) df = pd.merge(df, fips, how='left', left_on='State', right_on='StateAbbrev') # set us location code df.loc[df['State_x'] == 'US', 'FIPS_2'] = US_FIPS # drop "All" in variabledescription2 df.loc[df['VariableDescriptionPart2'] == 'All', 'VariableDescriptionPart2'] = 'drop' # combine variable descriptions to create Activity name and remove ", drop" df['ActivityProducedBy'] = df['VariableDescriptionPart1'] + \ ', ' + df['VariableDescriptionPart2'] df['ActivityProducedBy'] = df['ActivityProducedBy'].str.replace(", drop", "", regex=True) # trim whitespace df['ActivityProducedBy'] = df['ActivityProducedBy'].str.strip() # drop columns df = df.drop(columns=['artificialKey', 'PublicationDate', 'Source', 'ChainType_GDP_Deflator', 'VariableDescriptionPart1', 'VariableDescriptionPart2', 'State_x', 'State_y', 'StateAbbrev', 'unit_desc']) # rename columns df = df.rename(columns={"VariableDescriptionTotal": "Description", "Amount": "FlowAmount", "FIPS_2": "Location"}) # assign flowname, based on comma placement df['FlowName'] = df['Description'].str.split(',').str[0] # add location system based on year of data df['Year'] = df['Year'].astype(int) df.loc[df['Year'] >= 2019, 'LocationSystem'] = 'FIPS_2019' df.loc[df['Year'].between(2015, 2018), 'LocationSystem'] = 'FIPS_2015' df.loc[df['Year'].between(2013, 2014), 'LocationSystem'] = 'FIPS_2013' df.loc[df['Year'].between(2010, 2012), 'LocationSystem'] = 'FIPS_2010' # drop unnecessary rows df = df[df['FlowName'].str.contains("Cash receipts")] # the unit is $1000 USD, so multiply FlowAmount by 1000 and set unit as 'USD' df['FlowAmount'] = df['FlowAmount'].astype(float) df['FlowAmount'] = df['FlowAmount'] * 1000 # hard code data df['Class'] = 'Money' df['SourceName'] = 'USDA_ERS_FIWS' df['Unit'] = 'USD' # Add DQ scores df['DataReliability'] = 5 # tmp df['DataCollection'] = 5 # tmp # sort df df = df.sort_values(['Location', 'FlowName']) # reset index df.reset_index(drop=True, inplace=True) return df
def Census_CBP_URL_helper(*, build_url, year, **_): """ This helper function uses the "build_url" input from flowbyactivity.py, which is a base url for data imports that requires parts of the url text string to be replaced with info specific to the data year. This function does not parse the data, only modifies the urls from which data is obtained. :param build_url: string, base url :param year: year :return: list, urls to call, concat, parse, format into Flow-By-Activity format """ urls_census = [] # This section gets the census data by county instead of by state. # This is only for years 2010 and 2011. This is done because the State # query that gets all counties returns too many results and errors out. if year in ['2010', '2011']: if year == '2011': fips_year = '2010' else: fips_year = '2010' county_fips_df = get_county_FIPS(fips_year) county_fips = county_fips_df.FIPS for d in county_fips: url = build_url state_digit = str(d[0]) + str(d[1]) county_digit = str(d[2]) + str(d[3]) + str(d[4]) url = url.replace("__NAICS__", "NAICS2007") url = url.replace("__stateFIPS__", state_digit) url = url.replace("__countyFIPS__", county_digit) if year == "2010": # These are the counties where data is not available. # s signifies state code and y indicates year. s_02_y_10 = ["105", "195", "198", "230", "275"] s_15_y_10 = ["005"] s_48_y_10 = ["269"] # There are specific counties in various states for the year # 2010 that do not have data. For these counties a URL is not # generated as if there is no data then an error occurs. if state_digit == "02" and county_digit in s_02_y_10 or \ state_digit == "15" and county_digit in s_15_y_10 or \ state_digit == "48" and county_digit in s_48_y_10: pass else: urls_census.append(url) else: # These are the counties where data is not available. # s signifies state code and y indicates year. s_02_y_11 = ["105", "195", "198", "230", "275"] s_15_y_11 = ["005"] s_48_y_11 = ["269", "301"] # There are specific counties in various states for the year # 2011 that do not have data. For these counties a URL is # not generated as if there is no data then an error occurs. if state_digit == "02" and county_digit in s_02_y_11 or \ state_digit == "15" and county_digit in s_15_y_11 or \ state_digit == "48" and county_digit in s_48_y_11: pass else: urls_census.append(url) else: FIPS_2 = get_all_state_FIPS_2()['FIPS_2'] for c in FIPS_2: url = build_url url = url.replace("__stateFIPS__", c) # specified NAICS code year depends on year of data if year in ['2017']: url = url.replace("__NAICS__", "NAICS2017") url = url.replace("__countyFIPS__", "*") if year in ['2012', '2013', '2014', '2015', '2016']: url = url.replace("__NAICS__", "NAICS2012") url = url.replace("__countyFIPS__", "*") urls_census.append(url) return urls_census