def convert_blackhurst_data_to_kg_per_year(df, **kwargs):
    """
    Load BEA Make After Redefinition data and use it to convert Blackhurst
    IO dataframe units to kg per year (the code sets Unit to 'kg';
    the prior docstring incorrectly said 'gallon per year').
    :param df: df, FBA format
    :param kwargs: kwargs includes "attr" - dictionary, attribute data from
        method yaml for activity set, and "download_FBA_if_missing" - bool,
        indicate if missing FBAs should be downloaded from Data Commons
    :return: df, transformed fba df with FlowAmounts in kg/yr
    """
    # load the bea make table for the allocation source year
    bmt = load_fba_w_standardized_units(
        datasource='BEA_Make_AR',
        year=kwargs['attr']['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=kwargs['download_FBA_if_missing'])
    # drop rows with flowamount = 0 so they do not zero-out merged values
    bmt = bmt[bmt['FlowAmount'] != 0]
    # check on units of dfs before merge
    compare_df_units(df, bmt)
    # inner merge: Blackhurst consuming activity matched to BEA producing
    # activity within the same location
    bh_df_revised = pd.merge(
        df, bmt[['FlowAmount', 'ActivityProducedBy', 'Location']],
        left_on=['ActivityConsumedBy', 'Location'],
        right_on=['ActivityProducedBy', 'Location'])
    # scale Blackhurst flows (x) by the BEA make values (y)
    bh_df_revised.loc[:, 'FlowAmount'] = ((bh_df_revised['FlowAmount_x']) *
                                          (bh_df_revised['FlowAmount_y']))
    bh_df_revised.loc[:, 'Unit'] = 'kg'
    # drop merge-suffixed columns no longer needed
    bh_df_revised = bh_df_revised.drop(
        columns=["FlowAmount_x", "FlowAmount_y", 'ActivityProducedBy_y'])
    bh_df_revised = bh_df_revised.rename(
        columns={"ActivityProducedBy_x": "ActivityProducedBy"})

    return bh_df_revised
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county
    level, use state level harvested cropland to create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """
    # tmp drop NoneTypes so string operations on sector codes are safe
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data (NAICS beginning '112'); handled separately
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location (2-digit state FIPS) to merge county rows with
    # state-level ratios
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(datasource="USDA_CoA_Cropland_NAICS",
                                          year=year, flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&' (combined activity codes)
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in
    #               ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') &
    #                   (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # for loop through naics lengths to determine naics 4 and 5 digits to
    # disaggregate: each pass splits length-i sectors into length-(i+1)
    # children using the state-level ratios
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows (keep=False) — a parent that already
        # has children in the data must not be disaggregated again
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(
                lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations (state FIPS) and parent sector
        df_subset = pd.merge(df_subset, naics_subset[[
            sector_column, 'FlowAmountRatio', 'Sector_tmp', 'Location_tmp'
        ]], how='left', left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df so the next pass (i+1) can
        # disaggregate them further
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])
    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data (set aside at top) rejoined untouched
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def disaggregate_pastureland(fba_w_sector, attr, method, year, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the
    3 digit NAICS '112'. This function uses state level CoA 'Land in Farms'
    to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to
        sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated pastureland
        data
    """
    # tmp drop NoneTypes so string operations on sector codes are safe
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland (NAICS beginning '112')
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips (first 2 digits of county FIPS)
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year, flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(
            lambda x: x[0:3]) == '112']
        # drop rows with "&" (combined activity codes)
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location, leaving the 2-digit state FIPS
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data on state FIPS
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left', left_on="Location_tmp", right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=['Location_tmp', sector_column + '_x',
                              'Location_y', 'FlowAmountRatio'])
        df = df.rename(columns={sector_column + '_y': sector_column,
                                "Location_x": 'Location'})

        # drop rows where sector = 112 and then concat with original
        # fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def dataset_allocation_method(flow_subset_mapped, attr, names, method, k, v,
                              aset, aset_names, download_FBA_if_missing):
    """
    Method of allocation using a specified data source
    :param flow_subset_mapped: FBA subset mapped using federal
        elementary flow list
    :param attr: dictionary, attribute data from method yaml for activity set
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :param aset: dictionary items for FBS method yaml
    :param aset_names: list, activity set names
    :param download_FBA_if_missing: bool, indicate if missing FBAs should be
        downloaded from Data Commons
    :return: df, allocated activity names
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml; these are
    # optional keyword arguments passed through to load_map_clean_fba
    fba_dict = {}
    if 'allocation_flow' in attr:
        fba_dict['flowname_subset'] = attr['allocation_flow']
    if 'allocation_compartment' in attr:
        fba_dict['compartment_subset'] = attr['allocation_compartment']
    if 'clean_allocation_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_allocation_fba']
    if 'clean_allocation_fba_w_sec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_allocation_fba_w_sec']

    # load the allocation FBA
    fba_allocation_wsec = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['allocation_source'],
                           df_year=attr['allocation_source_year'],
                           flowclass=attr['allocation_source_class'],
                           geoscale_from=attr['allocation_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # subset fba datasets to only keep the sectors associated
    # with activity subset
    log.info("Subsetting %s for sectors in %s", attr['allocation_source'], k)
    fba_allocation_subset = \
        get_fba_allocation_subset(fba_allocation_wsec, k, names,
                                  flowSubsetMapped=flow_subset_mapped,
                                  allocMethod=attr['allocation_method'])

    # if there is an allocation helper dataset, modify allocation df
    if 'helper_source' in attr:
        log.info("Using the specified allocation help for subset of %s",
                 attr['allocation_source'])
        fba_allocation_subset = \
            allocation_helper(fba_allocation_subset, attr, method, v,
                              download_FBA_if_missing=download_FBA_if_missing)

    # create flow allocation ratios for each activity
    flow_alloc_list = []
    # grouping fields depend on whether the df carries a mapped 'Context'
    if 'Context' in fba_allocation_subset.columns:
        group_cols = fba_mapped_wsec_default_grouping_fields
    else:
        group_cols = fba_wsec_default_grouping_fields
    group_cols = [e for e in group_cols
                  if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
    n_allocated = []
    for n in names:
        log.debug("Creating allocation ratios for %s", n)
        # if n has already been called, drop all rows of data
        # containing n to avoid double counting when there are two
        # activities in each ACB and APB columns
        fba_allocation_subset = fba_allocation_subset[~(
            (fba_allocation_subset[fba_activity_fields[0]].isin(n_allocated))
            |
            (fba_allocation_subset[fba_activity_fields[1]].isin(n_allocated))
        )].reset_index(drop=True)
        fba_allocation_subset_2 = \
            get_fba_allocation_subset(fba_allocation_subset, k, [n],
                                      flowSubsetMapped=flow_subset_mapped,
                                      allocMethod=attr['allocation_method'],
                                      activity_set_names=aset_names)
        if len(fba_allocation_subset_2) == 0:
            log.info("No data found to allocate %s", n)
        else:
            flow_alloc = \
                allocate_by_sector(fba_allocation_subset_2, attr,
                                   attr['allocation_method'], group_cols,
                                   flowSubsetMapped=flow_subset_mapped)
            # tag each ratio row with the activity it allocates
            flow_alloc = flow_alloc.assign(FBA_Activity=n)
            n_allocated.append(n)
            flow_alloc_list.append(flow_alloc)
    flow_allocation = pd.concat(flow_alloc_list, ignore_index=True)

    # generalize activity field names to enable link to main fba source
    log.info("Generalizing activity columns in subset of %s",
             attr['allocation_source'])
    flow_allocation = collapse_activity_fields(flow_allocation)

    # check for issues with allocation ratios
    check_allocation_ratios(flow_allocation, aset, method, attr)

    # create list of sectors in the flow allocation df,
    # drop any rows of data in the flow df that aren't in list
    sector_list = flow_allocation['Sector'].unique().tolist()

    # subset fba allocation table to the values in the activity
    # list, based on overlapping sectors
    flow_subset_mapped = flow_subset_mapped.loc[
        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

    # check if fba and allocation dfs have the same LocationSystem
    log.info("Checking if flowbyactivity and allocation "
             "dataframes use the same location systems")
    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

    # merge fba df w/flow allocation dataset; one merge per activity field
    # (produced-by then consumed-by), producing _x/_y suffixed columns
    log.info("Merge %s and subset of %s", k, attr['allocation_source'])
    for i, j in activity_fields.items():
        # check units
        compare_df_units(flow_subset_mapped, flow_allocation)
        # create list of columns to merge on
        if 'allocation_merge_columns' in attr:
            fa_cols = \
                ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity'] + \
                attr['allocation_merge_columns']
            l_cols = \
                ['Location', j[1]["flowbysector"],
                 j[0]["flowbyactivity"]] + \
                attr['allocation_merge_columns']
            r_cols = ['Location', 'Sector', 'FBA_Activity'] + \
                attr['allocation_merge_columns']
        else:
            fa_cols = ['Location', 'Sector', 'FlowAmountRatio',
                       'FBA_Activity']
            l_cols = ['Location', j[1]["flowbysector"],
                      j[0]["flowbyactivity"]]
            r_cols = ['Location', 'Sector', 'FBA_Activity']
        flow_subset_mapped = \
            flow_subset_mapped.merge(flow_allocation[fa_cols],
                                     left_on=l_cols, right_on=r_cols,
                                     how='left')

    # merge the flowamount columns: take the produced-by ratio (_x) and
    # fall back to the consumed-by ratio (_y)
    flow_subset_mapped.loc[:, 'FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio_x'].fillna(
            flow_subset_mapped['FlowAmountRatio_y'])
    # fill null rows with 0 because no allocation info
    flow_subset_mapped['FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio'].fillna(0)

    # drop rows where there is no allocation data
    fbs = flow_subset_mapped.dropna(subset=['Sector_x', 'Sector_y'],
                                    how='all').reset_index()

    # calculate flow amounts for each sector
    log.info("Calculating new flow amounts using flow ratios")
    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

    # drop columns
    log.info("Cleaning up new flow by sector")
    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                            'FlowAmountRatio_y', 'FlowAmountRatio',
                            'FBA_Activity_x', 'FBA_Activity_y'])

    return fbs
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
        should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml; optional
    # keyword arguments passed through to load_map_clean_fba
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is all 'None', merge on the other column
    # NOTE(review): when neither column is all-None, log.error does not
    # raise, so sector_col_to_merge would be unbound below — confirm callers
    # never hit this path
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        # county-level df matched to state-level helper via 2-digit FIPS
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        # national helper applied to every location: merge on sector only
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:
        # same geo scale: merge on location and sector
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')

    # load bea codes that sub for naics
    bea = return_bea_codes_used_as_naics()
    # replace sector column and helperflow value if the sector column to
    # merge is in the bea list to prevent dropped data
    modified_fba_allocation['Sector'] = \
        np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                 modified_fba_allocation[sector_col_to_merge],
                 modified_fba_allocation['Sector'])
    modified_fba_allocation['HelperFlow'] = \
        np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                 modified_fba_allocation['FlowAmount'],
                 modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values = \
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
                modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0,
        # so after multiplying, don't have incorrect value associated with
        # new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation = \
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] = \
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation = \
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation = \
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation = \
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # after multiplying gal/employee by employee counts, unit becomes gal
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                    modified_fba_allocation, attr,
                    download_FBA_if_missing=download_FBA_if_missing)

    return modified_fba_allocation
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian GDP data to convert 3 digit canadian water use to us
    water use:
    - canadian gdp
    - us gdp
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: df, FBA format, flowamounts converted
    """
    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(datasource='StatCan_GDP',
                                        year=attr['allocation_source_year'],
                                        flowclass='Money')
    # drop 31-33 (aggregate manufacturing code)
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge water use (consuming activity) with GDP (producing activity)
    df_m = pd.merge(df, gdp[['CanDollar', 'ActivityProducedBy']],
                    how='left', left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    # drop rows with no GDP to avoid division by zero below
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)
    # convert to mgal/USD: water use divided by USD-denominated GDP
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] /
                                                      exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'

    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m,
                                       str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'],
        flowclass='Money')

    # load bea crosswalk to map BEA detail codes to 3-digit NAICS
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load, cw, how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use: Mgal/USD intensity times US GDP
    df_m2 = pd.merge(df_m, us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left', left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
def convert_blackhurst_data_to_kg_per_employee(
        df_wsec, attr, method, **kwargs):
    """
    Load BLS employment data and use it to transform Blackhurst water
    withdrawal data into kilograms per employee (Unit set to 'kg/p';
    the prior docstring incorrectly said 'gallons per employee').
    :param df_wsec: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param kwargs: includes "download_FBA_if_missing" - bool, indicate if
        missing FBAs should be downloaded from Data Commons
    :return: df, transformed fba dataframe with sector columns
    """
    # load 2002 employment data (Blackhurst data is for 2002)
    bls = load_fba_w_standardized_units(
        datasource='BLS_QCEW', year='2002', flowclass='Employment',
        geographic_level='national',
        download_FBA_if_missing=kwargs['download_FBA_if_missing'])
    # clean df
    bls = clean_bls_qcew_fba(bls, attr=attr)

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={'SectorProducedBy': 'Sector',
                                        'FlowAmount': 'HelperFlow'})

    # check units before merge
    compare_df_units(df_wsec, bls_wsec)
    # merge the two dfs on location and sector
    df = pd.merge(df_wsec, bls_wsec[['Location', 'Sector', 'HelperFlow']],
                  how='left', left_on=['Location', 'SectorConsumedBy'],
                  right_on=['Location', 'Sector'])
    # drop any rows where sector is None
    df = df[~df['Sector'].isnull()]
    # fill helperflow values with 0
    df['HelperFlow'] = df['HelperFlow'].fillna(0)

    # calculate proportional ratios
    df_wratio = proportional_allocation_by_location_and_activity(df, 'Sector')
    df_wratio = df_wratio.rename(columns={'FlowAmountRatio': 'EmployeeRatio',
                                          'HelperFlow': 'Employees'})
    # drop rows where employees = 0 to avoid division by zero below
    df_wratio = df_wratio[df_wratio['Employees'] != 0]

    # calculate kg/employee in 2002: apportion flow by employee ratio,
    # then divide by employee count
    df_wratio.loc[:, 'FlowAmount'] = \
        (df_wratio['FlowAmount'] * df_wratio['EmployeeRatio']) / \
        df_wratio['Employees']
    df_wratio.loc[:, 'Unit'] = 'kg/p'

    # drop cols
    df_wratio = df_wratio.drop(
        columns=['Sector', 'Employees', 'EmployeeRatio'])

    return df_wratio
def allocate_usda_ers_mlu_land_in_rural_transportation_areas(
        df, attr, fbs_list):
    """
    This function is used to allocate the USDA_ERS_MLU activity
    'land in rural transportation areas' to NAICS 2012 sectors (the prior
    docstring incorrectly said 'land in urban areas'). Allocation is
    dependent on assumptions defined in 'literature_values.py'.

    Methodology is based on the manuscript:
    Lin Zeng and Anu Ramaswami
    Impact of Locational Choices and Consumer Behaviors on Personal Land
    Footprints: An Exploration Across the Urban-Rural Continuum in the
    United States
    Environmental Science & Technology 2020 54 (6), 3091-3102
    DOI: 10.1021/acs.est.9b06024
    :param df: df, USDA ERA MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the
        activity set that calls on this fxn
    :return: df, allocated USDS ERS MLU Land, FBS format
    """
    # define sector column to base calculations
    sector_col = 'SectorConsumedBy'

    # load the federal highway administration fees dictionary
    fha_dict = get_transportation_sectors_based_on_FHA_fees()
    df_fha = pd.DataFrame.from_dict(
        fha_dict, orient='index').rename(
            columns={'NAICS_2012_Code': sector_col})

    # make an assumption about the percent of rural transport
    # area used by airports
    airport_multiplier = get_urban_land_use_for_airports()
    df_airport = df[df[sector_col] == '488119']
    df_airport = df_airport.assign(FlowAmount=df_airport['FlowAmount'] *
                                   airport_multiplier)

    # make an assumption about the percent of rural transport
    # area used by railroads
    railroad_multiplier = get_urban_land_use_for_railroads()
    df_railroad = df[df[sector_col] == '482112']
    df_railroad = df_railroad.assign(FlowAmount=df_railroad['FlowAmount'] *
                                     railroad_multiplier)

    # further allocate the remaining rural transportation area
    # using Federal Highway Administration fees
    # first subtract area for airports and railroads
    air_rail_area = pd.concat([df_airport, df_railroad], sort=False)
    air_rail_area = air_rail_area[['Location', 'Unit', 'FlowAmount']]
    air_rail_area_sum = \
        air_rail_area.groupby(['Location', 'Unit'], as_index=False)\
        .agg({'FlowAmount': sum}).rename(columns={'FlowAmount': 'AirRail'})

    # compare units
    compare_df_units(df, air_rail_area)
    df_highway = df.merge(air_rail_area_sum, how='left')
    df_highway = df_highway.assign(FlowAmount=df_highway['FlowAmount'] -
                                   df_highway['AirRail'])
    df_highway.drop(columns=['AirRail'], inplace=True)

    # add fed highway administration fees; rows without a fee share are
    # dropped
    df_highway2 = df_highway.merge(df_fha, how='left')
    df_highway2 = df_highway2[df_highway2['ShareOfFees'].notna()]
    df_highway2 = df_highway2.assign(FlowAmount=df_highway2['FlowAmount'] *
                                     df_highway2['ShareOfFees'])
    df_highway2.drop(columns=['ShareOfFees'], inplace=True)

    # concat airport, railroad, highway
    allocated_rural_trans = pd.concat([df_airport, df_railroad, df_highway2],
                                      sort=False, ignore_index=True)

    return allocated_rural_trans
def allocate_usda_ers_mlu_land_in_urban_areas(df, attr, fbs_list):
    """
    Allocate the USDA_ERS_MLU activity 'land in urban areas' to
    NAICS 2012 sectors.

    Allocation is dependent on assumptions defined in
    'literature_values.py' as well as results from allocating
    'EIA_CBECS_Land' and 'EIA_MECS_Land' to land based sectors.

    Methodology is based on the manuscript:
    Lin Zeng and Anu Ramaswami
    Impact of Locational Choices and Consumer Behaviors on Personal
    Land Footprints: An Exploration Across the Urban-Rural Continuum
    in the United States
    Environmental Science & Technology 2020 54 (6), 3091-3102
    DOI: 10.1021/acs.est.9b06024

    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for
        activity set
    :param fbs_list: list, FBS dfs for activities created prior to
        the activity set that calls on this fxn; must include the
        'EIA_CBECS_Land' and 'EIA_MECS_Land' FBS dfs
    :return: df, allocated USDA ERS MLU Land, FBS format
    """
    # define sector column to base calculations
    sector_col = 'SectorConsumedBy'

    vLogDetailed.info('Assuming total land use from MECS and CBECS included '
                      'in urban land area, so subtracting out calculated '
                      'MECS and CBECS land from MLU urban land area')
    # read in the cbecs and mecs df from df_list
    # NOTE(review): if fbs_list lacks either source this raises a
    # NameError below — presumably guaranteed by the method yaml; confirm
    for df_i in fbs_list:
        if (df_i['MetaSources'] == 'EIA_CBECS_Land').all():
            cbecs = df_i
        elif (df_i['MetaSources'] == 'EIA_MECS_Land').all():
            mecs = df_i

    # load the federal highway administration fees dictionary and
    # reshape to a df keyed on the sector column
    fha_dict = get_transportation_sectors_based_on_FHA_fees()
    df_fha = pd.DataFrame.from_dict(
        fha_dict, orient='index').rename(
        columns={'NAICS_2012_Code': sector_col})

    # calculate total residential area from the American Housing Survey;
    # the survey value replaces (not scales) the MLU flow amount
    residential_land_area = get_area_of_urban_land_occupied_by_houses_2013()
    df_residential = df[df[sector_col] == 'F01000']
    df_residential = df_residential.assign(FlowAmount=residential_land_area)

    # make an assumption about the percent of urban area that
    # is open space
    openspace_multiplier = get_open_space_fraction_of_urban_area()
    df_openspace = df[df[sector_col] == '712190']
    df_openspace = df_openspace.assign(
        FlowAmount=df_openspace['FlowAmount'] * openspace_multiplier)

    # sum all uses of urban area that are NOT transportation
    # first concat dfs for residential, openspace, commercial,
    # and manufacturing land use
    df_non_urban_transport_area = pd.concat(
        [df_residential, df_openspace, cbecs, mecs],
        sort=False, ignore_index=True)
    df_non_urban_transport_area = df_non_urban_transport_area[
        ['Location', 'Unit', 'FlowAmount']]
    # use the string 'sum' rather than the builtin, which is
    # deprecated as an agg func in recent pandas
    non_urban_transport_area_sum = df_non_urban_transport_area.groupby(
        ['Location', 'Unit'], as_index=False).agg(
        {'FlowAmount': 'sum'}).rename(
        columns={'FlowAmount': 'NonTransport'})

    # compare units before merging
    compare_df_units(df, df_non_urban_transport_area)
    # calculate total urban transportation by subtracting
    # calculated areas from total urban land
    df_transport = df.merge(non_urban_transport_area_sum, how='left')
    df_transport = df_transport.assign(
        FlowAmount=df_transport['FlowAmount'] -
        df_transport['NonTransport'])
    df_transport.drop(columns=['NonTransport'], inplace=True)

    # make an assumption about the percent of urban transport
    # area used by airports
    airport_multiplier = get_urban_land_use_for_airports()
    df_airport = df_transport[df_transport[sector_col] == '488119']
    df_airport = df_airport.assign(
        FlowAmount=df_airport['FlowAmount'] * airport_multiplier)

    # make an assumption about the percent of urban transport
    # area used by railroads
    railroad_multiplier = get_urban_land_use_for_railroads()
    df_railroad = df_transport[df_transport[sector_col] == '482112']
    df_railroad = df_railroad.assign(
        FlowAmount=df_railroad['FlowAmount'] * railroad_multiplier)

    # further allocate the remaining urban transportation area using
    # Federal Highway Administration fees
    # first subtract area for airports and railroads
    air_rail_area = pd.concat([df_airport, df_railroad], sort=False)
    air_rail_area = air_rail_area[['Location', 'Unit', 'FlowAmount']]
    air_rail_area_sum = air_rail_area.groupby(
        ['Location', 'Unit'], as_index=False).agg(
        {'FlowAmount': 'sum'}).rename(columns={'FlowAmount': 'AirRail'})

    df_highway = df_transport.merge(air_rail_area_sum, how='left')
    df_highway = df_highway.assign(
        FlowAmount=df_highway['FlowAmount'] - df_highway['AirRail'])
    df_highway.drop(columns=['AirRail'], inplace=True)

    # add fed highway administration fees and drop sectors with no
    # fee share, scaling remaining area by each sector's share
    df_highway2 = df_highway.merge(df_fha, how='left')
    df_highway2 = df_highway2[df_highway2['ShareOfFees'].notna()]
    df_highway2 = df_highway2.assign(
        FlowAmount=df_highway2['FlowAmount'] * df_highway2['ShareOfFees'])
    df_highway2.drop(columns=['ShareOfFees'], inplace=True)

    # concat all df subsets
    allocated_urban_areas_df = pd.concat(
        [df_residential, df_openspace, df_airport, df_railroad,
         df_highway2],
        ignore_index=True, sort=False).reset_index(drop=True)

    return allocated_urban_areas_df
def check_golf_and_crop_irrigation_totals(df_load):
    """
    Check that golf + crop values equal published irrigation totals.
    If not, assign the difference to crop irrigation.

    :param df_load: df, USGS water use, FBA format
    :return: df, FBA with reassigned irrigation water to crop and golf
    """
    # drop national data
    df = df_load[df_load['Location'] != '00000']

    # subset into golf, crop, and total irrigation (and non irrigation)
    df_i = df[(df[fba_activity_fields[0]] == 'Irrigation') |
              (df[fba_activity_fields[1]] == 'Irrigation')]
    df_g = df[(df[fba_activity_fields[0]] == 'Irrigation Golf Courses') |
              (df[fba_activity_fields[1]] == 'Irrigation Golf Courses')]
    df_c = df[(df[fba_activity_fields[0]] == 'Irrigation Crop') |
              (df[fba_activity_fields[1]] == 'Irrigation Crop')]

    # unit check before merging
    compare_df_units(df_i, df_g)
    # merge the golf and total irrigation into crop df and
    # modify crop FlowAmounts if necessary
    df_m = pd.merge(df_i,
                    df_g[['FlowName', 'FlowAmount', 'ActivityProducedBy',
                          'ActivityConsumedBy', 'Compartment', 'Location',
                          'Year']],
                    how='outer',
                    right_on=['FlowName', 'Compartment', 'Location', 'Year'],
                    left_on=['FlowName', 'Compartment', 'Location', 'Year'])
    df_m = df_m.rename(
        columns={
            "FlowAmount_x": "FlowAmount",
            "ActivityProducedBy_x": "ActivityProducedBy",
            "ActivityConsumedBy_x": "ActivityConsumedBy",
            "FlowAmount_y": "Golf_Amount",
            "ActivityProducedBy_y": "Golf_APB",
            "ActivityConsumedBy_y": "Golf_ACB",
        })
    compare_df_units(df_m, df_c)
    df_m2 = pd.merge(df_m,
                     df_c[['FlowName', 'FlowAmount', 'ActivityProducedBy',
                           'ActivityConsumedBy', 'Compartment', 'Location',
                           'Year']],
                     how='outer',
                     right_on=['FlowName', 'Compartment', 'Location', 'Year'],
                     left_on=['FlowName', 'Compartment', 'Location', 'Year'])
    df_m2 = df_m2.rename(
        columns={
            "FlowAmount_x": "FlowAmount",
            "ActivityProducedBy_x": "ActivityProducedBy",
            "ActivityConsumedBy_x": "ActivityConsumedBy",
            "FlowAmount_y": "Crop_Amount",
            "ActivityProducedBy_y": "Crop_APB",
            "ActivityConsumedBy_y": "Crop_ACB"
        })

    # fill na and sum crop and golf
    # df_m2 = df_m2.fillna(0)
    # NOTE(review): fillna is deliberately left disabled above; rows
    # missing golf or crop amounts yield NaN diffs, which the >= filter
    # below drops — confirm this is the intended treatment
    df_m2['subset_sum'] = df_m2['Crop_Amount'] + df_m2['Golf_Amount']
    df_m2['Diff'] = df_m2['FlowAmount'] - df_m2['subset_sum']
    # keep only rows where published total exceeds golf + crop
    df_m3 = df_m2[df_m2['Diff'] >= 0.000001].reset_index(drop=True)

    # rename irrigation to irrigation crop and append rows to df
    df_m3.loc[df_m3['ActivityProducedBy'] == 'Irrigation',
              'ActivityProducedBy'] = 'Irrigation Crop'
    df_m3.loc[df_m3['ActivityConsumedBy'] == 'Irrigation',
              'ActivityConsumedBy'] = 'Irrigation Crop'
    df_m3 = df_m3.drop(columns=[
        'Golf_Amount', 'Golf_APB', 'Golf_ACB', 'Crop_Amount', 'Crop_APB',
        'Crop_ACB', 'subset_sum', 'Diff'
    ])

    if len(df_m3) != 0:
        # use pd.concat: DataFrame.append was deprecated in pandas 1.4
        # and removed in pandas 2.0
        df_w_missing_crop = pd.concat([df_load, df_m3], sort=True,
                                      ignore_index=True)
        return df_w_missing_crop
    else:
        return df_load
def calculate_net_public_supply(df_load):
    """
    Calculate net public supply (PS) withdrawals from USGS data.

    USGS provides info on the quantity of public supply withdrawals
    that are delivered to domestic use. The USGS PS withdrawals are not
    necessarily greater than/equal to the Domestic deliveries because
    water can be withdrawn in one county and delivered in another
    (water can also cross state lines). Therefore, can/do end up with
    NEGATIVE net public supply values, so PS water should only be used
    at a national level.

    Domestic deliveries are subtracted from public supply. An
    assumption is made that PS deliveries to domestic are fresh water.
    The national level data can then be allocated to end users using
    the BEA Use tables.

    :param df_load: df, USGS water use, FBA format
    :return: df, with net public supply values
    """
    # subset into 2 dfs, one that contains PS data (df1)
    # and one that does not (df2)
    df1 = df_load[(df_load[fba_activity_fields[0]] == 'Public Supply') |
                  (df_load[fba_activity_fields[1]] == 'Public Supply')]
    df2 = df_load[(df_load[fba_activity_fields[0]] != 'Public Supply') &
                  (df_load[fba_activity_fields[1]] != 'Public Supply')]

    # drop all deliveries to thermo and industrial
    # (not enough states report the data to make usable)
    df1_sub = df1[~df1[fba_activity_fields[1]].isin([
        'Industrial', 'Thermoelectric Power',
        'Thermoelectric Power Closed-loop cooling',
        'Thermoelectric Power Once-through cooling'
    ])]
    # drop duplicate info of "Public Supply deliveries to": both the
    # total-deliveries rows and the deliveries-from rows duplicate
    # amounts already captured elsewhere
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("Public Supply total deliveries")]
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("deliveries from public supply")]
    # log the flow amount dropped by the Description filters
    vLogDetailed.info('Dropping rows that contain "deliveries from public '
                      'supply" to avoid double counting with rows of "Public '
                      'Supply deliveries to"')
    calculate_flowamount_diff_between_dfs(df1, df1_sub)
    # drop county level values because cannot use county data; keep
    # only state ('XX000') and national ('00000') FIPS
    vLogDetailed.info('Dropping county level public supply withdrawals '
                      'because will end up with negative values due to '
                      'instances of water deliveries coming from surrounding '
                      'counties')
    df1_sub = df1_sub[df1_sub['Location'].apply(
        lambda x: x[2:6] == '000')].reset_index(drop=True)

    # df of ps delivered (df_d), ps withdrawn (df_w), and us total (df_us)
    df_d = df1_sub[df1_sub[fba_activity_fields[0]] == 'Public Supply']
    df_w = df1_sub[df1_sub[fba_activity_fields[1]] == 'Public Supply']
    df_us = df1_sub[df1_sub['Location'] == '00000']
    # split withdrawals further into fresh water (assumption domestic
    # deliveries are freshwater): per-compartment rows (df_w1) and the
    # 'total' compartment rows (df_w2); assumption that water
    # withdrawal taken equally from ground and surface
    df_w1 = df_w[(df_w['FlowName'] == 'fresh') &
                 (df_w['Compartment'] != 'total')]
    df_w2 = df_w[(df_w['FlowName'] == 'fresh') &
                 (df_w['Compartment'] == 'total')]
    # compare units before merging
    compare_df_units(df_w1, df_w2)
    # attach each location's total fresh withdrawal to its
    # per-compartment rows
    df_wm = pd.merge(df_w1,
                     df_w2[['FlowAmount', 'Location', 'Unit']],
                     how='left',
                     left_on=['Location', 'Unit'],
                     right_on=['Location', 'Unit'])
    df_wm = df_wm.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "FlowTotal"
    })
    # compare units before merging
    compare_df_units(df_wm, df_d)
    # merge the deliveries to domestic
    df_w_modified = pd.merge(df_wm,
                             df_d[['FlowAmount', 'Location']],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_w_modified = df_w_modified.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "DomesticDeliveries"
    })

    # create flowratio for ground/surface (each compartment's share of
    # the location's total fresh withdrawal)
    df_w_modified.loc[:, 'FlowRatio'] = \
        df_w_modified['FlowAmount'] / df_w_modified['FlowTotal']
    # calculate new, net total public supply withdrawals; will end up
    # with negative values due to instances of water deliveries coming
    # from surrounding counties
    df_w_modified.loc[:, 'FlowAmount'] = \
        df_w_modified['FlowAmount'] - (df_w_modified['FlowRatio'] *
                                       df_w_modified['DomesticDeliveries'])

    net_ps = df_w_modified.drop(columns=["FlowTotal",
                                         "DomesticDeliveries"])

    # compare units before merging
    compare_df_units(df_d, net_ps)
    # because assuming domestic is all fresh, drop
    # flowname/flowable/Compartment/context
    # and instead use those column data from the net_ps df
    df_d_modified = df_d.drop(
        columns=['FlowName', 'Flowable', 'Compartment', 'Context',
                 'FlowUUID'])
    # also allocate deliveries to ground/surface from state ratios
    df_d_modified = pd.merge(df_d_modified,
                             net_ps[['FlowName', 'Flowable', 'Compartment',
                                     'Context', 'FlowUUID', 'Location',
                                     'FlowRatio']],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_d_modified.loc[:, 'FlowAmount'] = \
        df_d_modified['FlowAmount'] * df_d_modified['FlowRatio']
    df_d_modified = df_d_modified.drop(columns=["FlowRatio"])

    net_ps = net_ps.drop(columns=["FlowRatio"])

    # concat dfs back (non-public supply, public supply
    # deliveries, net ps withdrawals, us totals)
    modified_ps = pd.concat([df2, df_d_modified, net_ps, df_us],
                            ignore_index=True)

    return modified_ps