def calculate_floorspace_based_on_number_of_floors(fba):
    """
    Estimate total floorspace for each building type based on data on
    the number of floors for each building type.
    Assumptions (taken from Yang's static satellite tables):
    1. When floor range is 4-9, assume 6 stories
    2. When floor range is 10 or more, assume 15 stories
    :param fba: df, eia cbecs land flowbyactivity
    :return: df, fba with estimated building footprint
    """
    # disaggregate mercantile to malls and non-malls
    fba = disaggregate_eia_cbecs_mercentile(fba)
    # disaggregate other and vacant
    fba = disaggregate_eia_cbecs_vacant_and_other(fba)
    # drop data for 'all buildings'
    fba = fba[fba['Description'] != 'All buildings']
    # add column 'DivisionFactor' based on description
    fba = fba.assign(DivisionFactor=fba['Description'].apply(lambda x: (
        1 if 'One' in x else (2 if 'Two' in x else (3 if 'Three' in x else (
            6 if 'Four' in x else (15 if 'Ten' in x else "")))))))
    # modify flowamounts to represent building footprint rather than
    # total floorspace
    fba['FlowAmount'] = fba['FlowAmount'] / fba['DivisionFactor']
    # sum values for single flowamount for each building type
    groupbycols = fba_default_grouping_fields
    fba2 = aggregator(fba, groupbycols)
    # add description
    fba2 = fba2.assign(Description='Building Footprint')

    return fba2

def calculate_floorspace_based_on_number_of_floors(fba_load):
    """
    Estimate total floorspace for each building type based on data on
    the number of floors for each building type.
    Assumptions (taken from Yang's static satellite tables):
    1. When floor range is 4-9, assume 6 stories
    2. When floor range is 10 or more, assume 15 stories
    :param fba_load: df, eia cbecs land flowbyactivity
    :return: df, eia cbecs land fba with estimated total floorspace
    """
    # disaggregate mercantile to malls and non-malls
    fba = disaggregate_eia_cbecs_mercentile(fba_load)
    vLogDetailed.info('Calculate floorspace for mall and nonmall buildings '
                      'with different number of floors. Once calculated, '
                      'drop mercantile data from dataframe to avoid double '
                      'counting.')
    calculate_flowamount_diff_between_dfs(fba_load, fba)

    # disaggregate other and vacant
    fba2 = disaggregate_eia_cbecs_vacant_and_other(fba)
    vLogDetailed.info('Due to data suppression for floorspace by building '
                      'number of floors, some data is lost when dropping '
                      'floorspace for all buildings within a principal '
                      'building activity. To avoid this data loss, all '
                      'remaining floorspace for "All buildings" by number of '
                      'floors is allocated to "Vacant" and "Other" principal '
                      'building activities, as these activities are allocated '
                      'to all commercial building sectors. This assumption '
                      'results in a total floorspace increase for "Vacant" '
                      'and "Other" activities.')
    calculate_flowamount_diff_between_dfs(fba, fba2)

    # drop data for 'all buildings'
    fba3 = fba2[fba2['Description'] != 'All buildings']
    # add column 'DivisionFactor' based on description
    fba3 = fba3.assign(DivisionFactor=fba3['Description'].apply(lambda x: (
        1 if 'One' in x else (2 if 'Two' in x else (3 if 'Three' in x else (
            6 if 'Four' in x else (15 if 'Ten' in x else "")))))))
    # modify flowamounts to represent building footprint rather than
    # total floorspace
    fba3['FlowAmount'] = fba3['FlowAmount'] / fba3['DivisionFactor']
    # sum values for single flowamount for each building type
    vLogDetailed.info('Drop flows for "All Buildings" to avoid double '
                      'counting, as floorspace is maintained by buildings '
                      'based on number of floors. Also divide total '
                      'floorspace by number of floors to calculate a '
                      'building footprint. Calculations result in reduced '
                      'FlowAmount for all categories.')
    calculate_flowamount_diff_between_dfs(fba2, fba3)

    # rename the FlowNames and aggregate so there is a single total
    # floorspace value rather than multiple rows based on floors
    fba3 = fba3.assign(
        FlowName=fba3['FlowName'].apply(lambda x: ','.join(x.split(',')[:-1])))
    # modify the description
    fba3 = fba3.assign(Description='Building Footprint')
    groupbycols = fba_mapped_default_grouping_fields
    fba4 = aggregator(fba3, groupbycols)

    return fba4

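# Illustrative sketch (not part of flowsa; the helper name and description
# strings are hypothetical): shows how the nested 'DivisionFactor' lambda
# above maps CBECS floor-range descriptions to an assumed story count, and
# how dividing FlowAmount by that factor converts total floorspace to a
# building footprint.
def _example_division_factor():
    import pandas as pd
    df = pd.DataFrame({
        'Description': ['One floor', 'Two floors', 'Three floors',
                        'Four to nine floors', 'Ten or more floors'],
        'FlowAmount': [100.0, 100.0, 100.0, 600.0, 1500.0]})
    df['DivisionFactor'] = df['Description'].apply(lambda x: (
        1 if 'One' in x else (2 if 'Two' in x else (3 if 'Three' in x else (
            6 if 'Four' in x else (15 if 'Ten' in x else ""))))))
    # footprint = total floorspace / assumed number of stories
    df['Footprint'] = df['FlowAmount'] / df['DivisionFactor']
    return df  # Footprint: 100.0, 50.0, 33.3, 100.0, 100.0
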
def replace_missing_2_digit_sector_values(df):
    """
    In the 2015 (and possibly other) dfs, there are instances of values at
    the 3-digit NAICS level, while the 2-digit NAICS is reported as 0.
    The 0 values are replaced with summed 3-digit NAICS.
    :param df: df, BLS QCEW data in FBA format
    :return: df, BLS QCEW data with 2-digit NAICS sector FlowAmounts
    """
    # check for 2 digit 0 values
    df_missing = df[(df['ActivityProducedBy'].apply(lambda x: len(x) == 2)) &
                    (df['FlowAmount'] == 0)]
    # create list of location/activityproducedby combos
    missing_sectors = df_missing[[
        'Location', 'ActivityProducedBy']].drop_duplicates().values.tolist()

    # subset the df to 3 digit naics where flow amount is not 0 and
    # that would sum to the missing 2 digit naics
    df_subset = df[df['ActivityProducedBy'].apply(lambda x: len(x) == 3) &
                   (df['FlowAmount'] != 0)]
    new_sectors_list = []
    for q, r in missing_sectors:
        c1 = df_subset['Location'] == q
        c2 = df_subset['ActivityProducedBy'].apply(lambda x: x[0:2] == r)
        # subset data
        new_sectors_list.append(df_subset[c1 & c2])
    if len(new_sectors_list) != 0:
        new_sectors = pd.concat(new_sectors_list, sort=False,
                                ignore_index=True)
        # drop last digit of naics and aggregate
        new_sectors.loc[:, 'ActivityProducedBy'] = \
            new_sectors['ActivityProducedBy'].apply(lambda x: x[0:2])
        new_sectors = aggregator(new_sectors, fba_default_grouping_fields)

        # drop the old location/activity rows in the bls df and
        # add new sector values
        new_sectors_list = \
            new_sectors[['Location', 'ActivityProducedBy'
                         ]].drop_duplicates().values.tolist()
        # rows to drop
        rows_list = []
        for q, r in new_sectors_list:
            c1 = df['Location'] == q
            c2 = df['ActivityProducedBy'].apply(lambda x: x == r)
            # subset data
            rows_list.append(df[(c1 & c2)])
        rows_to_drop = pd.concat(rows_list, ignore_index=True)

        # drop rows from df
        modified_df = pd.merge(df, rows_to_drop, indicator=True,
                               how='outer').query(
            '_merge=="left_only"').drop('_merge', axis=1)
        # add new rows (pd.concat replaces the deprecated DataFrame.append)
        modified_df = pd.concat([modified_df, new_sectors], sort=False)
        return modified_df
    else:
        return df

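# Minimal sketch (hypothetical data, not part of flowsa): the gap-filling
# logic above in miniature -- a 2-digit NAICS reported as 0 is replaced by
# the sum of its non-zero 3-digit children at the same location.
def _example_fill_missing_2_digit_naics():
    import pandas as pd
    df = pd.DataFrame({
        'Location': ['36000'] * 4,
        'ActivityProducedBy': ['11', '111', '112', '115'],
        'FlowAmount': [0.0, 40.0, 25.0, 10.0]})
    # sum 3-digit children whose first two digits match the zero parent
    child_sum = df.loc[(df['ActivityProducedBy'].str.len() == 3) &
                       (df['ActivityProducedBy'].str[:2] == '11'),
                       'FlowAmount'].sum()
    df.loc[df['ActivityProducedBy'] == '11', 'FlowAmount'] = child_sum
    return df  # the '11' row now carries 75.0
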
def compare_geographic_totals(df_subset, df_load, sourcename, attr,
                              activity_set, activity_names):
    """
    Check for any data loss between the geoscale used and published
    national data
    :param df_subset: df, after subset by geography
    :param df_load: df, loaded data, including published national data
    :param sourcename: str, source name
    :param attr: dictionary, attributes
    :param activity_set: str, activity set
    :param activity_names: list of names in the activity set by which
        to subset national level data
    :return: df, comparing published national level data to df subset
    """
    # subset df_load to national level
    nat = df_load[df_load['Location'] == US_FIPS].reset_index(
        drop=True).rename(columns={'FlowAmount': 'FlowAmount_nat'})
    # if df len is not 0, continue with comparison
    if len(nat) != 0:
        # subset national level data by activity set names
        nat = nat[(nat[fba_activity_fields[0]].isin(activity_names)) |
                  (nat[fba_activity_fields[1]].isin(activity_names)
                   )].reset_index(drop=True)
        nat = replace_strings_with_NoneType(nat)
        # drop the geoscale in df_subset and sum
        sub = df_subset.assign(Location=US_FIPS)
        # depending on the datasource, might need to rename some
        # strings for national comparison
        sub = rename_column_values_for_comparison(sub, sourcename)
        sub2 = aggregator(sub, fba_default_grouping_fields).rename(
            columns={'FlowAmount': 'FlowAmount_sub'})

        # compare df
        merge_cols = ['Class', 'SourceName', 'FlowName', 'Unit',
                      'FlowType', 'ActivityProducedBy',
                      'ActivityConsumedBy', 'Compartment', 'Location',
                      'LocationSystem', 'Year']
        # compare units
        compare_df_units(nat, sub2)
        df_m = pd.merge(nat[merge_cols + ['FlowAmount_nat']],
                        sub2[merge_cols + ['FlowAmount_sub']],
                        how='outer')
        df_m = df_m.assign(
            FlowAmount_diff=df_m['FlowAmount_nat'] - df_m['FlowAmount_sub'])
        df_m = df_m.assign(Percent_Diff=(abs(
            df_m['FlowAmount_diff'] / df_m['FlowAmount_nat']) * 100))
        df_m = df_m[df_m['FlowAmount_diff'] != 0].reset_index(drop=True)
        # subset the merged df to what to include in the validation df
        # include data where percent difference is > 1 or where value is nan
        df_m_sub = df_m[(df_m['Percent_Diff'] > 1) |
                        (df_m['Percent_Diff'].isna())].reset_index(drop=True)

        if len(df_m_sub) == 0:
            vLog.info('No data loss greater than 1%% between national '
                      'level data and %s subset',
                      attr['allocation_from_scale'])
        else:
            vLog.info('There are data differences between published national'
                      ' values and %s subset, saving to validation log',
                      attr['allocation_from_scale'])
            vLogDetailed.info(
                'Comparison of National FlowAmounts to aggregated data '
                'subset for %s: \n {}'.format(df_m_sub.to_string()),
                activity_set)

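# Worked sketch (hypothetical values): the comparison above computes
# Percent_Diff = abs((national - subset) / national) * 100 and flags rows
# exceeding 1% (or where the national value is missing, yielding NaN).
def _example_percent_diff():
    nat, sub = 1000.0, 985.0
    pct_diff = abs((nat - sub) / nat) * 100  # 1.5
    return pct_diff > 1  # True -> row is written to the validation log
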
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the master crosswalk and subset by sectorsourcename,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug('Checking if sectors represent a different '
                   'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where(
                (df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename], 'FlowAmount'] = \
                df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [e for e in df.columns.values.tolist()
                        if e not in possible_column_headers]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull()) &
                     (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df

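# Illustrative sketch (hypothetical codes and ratios, not part of flowsa):
# when a sector from an older NAICS year maps to several codes in the
# target year, the merge above duplicates the row once per replacement and
# scales FlowAmount by each code's allocation_ratio, conserving the total.
def _example_naics_year_replacement():
    import pandas as pd
    df = pd.DataFrame({'SectorProducedBy': ['XXXX'], 'FlowAmount': [100.0]})
    cw_melt = pd.DataFrame({'NAICS': ['XXXX', 'XXXX'],
                            'NAICS_2012_Code': ['1111', '1112'],
                            'allocation_ratio': [0.6, 0.4]})
    m = df.merge(cw_melt, left_on='SectorProducedBy',
                 right_on='NAICS', how='left')
    m['FlowAmount'] = m['FlowAmount'] * m['allocation_ratio']
    m['SectorProducedBy'] = m['NAICS_2012_Code']
    # two rows: 60.0 assigned to '1111' and 40.0 to '1112'
    return m[['SectorProducedBy', 'FlowAmount']]
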
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after subsetting
    by activity and geography with the final flowbysector output total.
    Not a direct comparison of the loaded FBA because FBAs are modified
    before being subset by activity for the target sector level.
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """
    vLog.info('Comparing Flow-By-Activity subset by activity and geography '
              'to the subset Flow-By-Sector FlowAmount total.')

    # determine from scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[['Class', 'FlowAmount', 'Unit', 'Context',
                   'ActivityProducedBy', 'ActivityConsumedBy',
                   'Location', 'LocationSystem']]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(columns={'ActivityProducedBy': 'SectorProducedBy',
                                  'ActivityConsumedBy': 'SectorConsumedBy'})
        group_cols_agg = ['Class', 'Context', 'Unit', 'Location',
                          'LocationSystem', 'SectorProducedBy',
                          'SectorConsumedBy']
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = ['Class', 'FlowAmount', 'Unit', 'Context',
                  'Location', 'LocationSystem']
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount',
                            'Unit': 'FBA_unit'}, inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount',
                            'Unit': 'FBS_unit'}, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[['Class', 'Context', 'Location',
                             'LocationSystem', 'FBA_amount', 'FBA_unit',
                             'FBS_amount', 'FBS_unit',
                             'FlowAmount_difference', 'Percent_difference']]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info('Comparing FBA %s %s subset to FBS results. '
                  'Details in Validation Log', activity_set,
                  source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info('FlowBySector FlowAmount for %s %s %s '
                          'does not exist in the FBS', source_name,
                          activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is '
                          '%s%% less than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is '
                          '%s%% more than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info('FlowBySector FlowAmount for '
                                  '%s %s %s at %s is equal to the '
                                  'FlowByActivity FlowAmount',
                                  source_name, activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info('Save the comparison of FlowByActivity load to '
                 'FlowBySector total FlowAmounts for %s in validation '
                 'log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info('Comparison of FBA load to FBS total '
                              'FlowAmounts for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')

def compare_activity_to_sector_flowamounts(fba_load, fbs_load,
                                           activity_set, source_name,
                                           config):
    """
    Function to compare the loaded flowbyactivity with the final
    flowbysector by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
        save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities '
                   'because if there are modifications to flowamounts '
                   'for a sector, then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[['Class', 'MetaSources', 'Flowable', 'Unit',
                        'FlowType', 'ActivityProducedBy',
                        'ActivityConsumedBy', 'Context', 'Location',
                        'LocationSystem', 'Year', 'FlowAmount'
                        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = ['ActivityProducedBy', 'ActivityConsumedBy',
                      'Flowable', 'Unit', 'FlowType', 'Context',
                      'Location', 'LocationSystem', 'Year']
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df
        fbs = fbs_load[['Class', 'SectorSourceName', 'Flowable', 'Unit',
                        'FlowType', 'SectorProducedBy', 'SectorConsumedBy',
                        'ActivityProducedBy', 'ActivityConsumedBy',
                        'Context', 'Location', 'LocationSystem', 'Year',
                        'FlowAmount']].drop_duplicates().reset_index(
            drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = ['ActivityProducedBy', 'ActivityConsumedBy',
                      'Flowable', 'Unit', 'FlowType', 'Context',
                      'Location', 'LocationSystem', 'Year', 'SectorLength']
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge compare 1 and compare 2
        df_merge = fba_agg.merge(
            fbs_agg,
            left_on=['ActivityProducedBy', 'ActivityConsumedBy',
                     'Flowable', 'Unit', 'FlowType', 'Context',
                     'Location', 'LocationSystem', 'Year'],
            right_on=['ActivityProducedBy', 'ActivityConsumedBy',
                      'Flowable', 'Unit', 'FlowType', 'Context',
                      'Location', 'LocationSystem', 'Year'],
            how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[['ActivityProducedBy', 'ActivityConsumedBy',
                             'Flowable', 'Unit', 'FlowType', 'Context',
                             'Location', 'LocationSystem', 'Year',
                             'SectorLength', 'FBA_amount', 'FBS_amount',
                             'Ratio']]

        # keep only rows of specified sector length
        comparison = df_merge[
            df_merge['SectorLength'] == sector_level_key[
                config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info('There are %s combinations of flowable/context/sector '
                      'length where the flowbyactivity to flowbysector '
                      'ratio is less than or greater than 1 by %s',
                      len(comparison2), str(tolerance))

        # include df subset in the validation log
        # only print rows where flowamount ratio is less than 1
        # (round flowamountratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info('Save the comparison of FlowByActivity load '
                 'to FlowBySector ratios for %s in validation log',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info('Comparison of FlowByActivity load to '
                              'FlowBySector ratios for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)

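# Sketch (assumed values): the tolerance band used above flags FBS/FBA
# ratios falling outside [1 - tolerance, 1 + tolerance].
def _example_outside_tolerance(ratio, tolerance=0.01):
    # True for e.g. 0.97 or 1.05; False for 0.995
    return ratio < 1 - tolerance or ratio > 1 + tolerance
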
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian GDP data to convert 3-digit Canadian water use to US
    water use:
    - Canadian GDP
    - US GDP
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for
        activity set
    :return: df, FBA format, flowamounts converted
    """
    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(
        datasource='StatCan_GDP', year=attr['allocation_source_year'],
        flowclass='Money')
    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge df
    df_m = pd.merge(df, gdp[['CanDollar', 'ActivityProducedBy']],
                    how='left', left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)
    # convert to mgal/USD
    df_m.loc[:, 'FlowAmount'] = \
        df_m['FlowAmount'] / (df_m['CanDollar'] / exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'

    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(
        df_m, str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'], flowclass='Money')

    # load bea crosswalk
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load, cw, how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use
    df_m2 = pd.merge(df_m, us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left', left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2

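# Worked sketch (hypothetical numbers) of the two-step conversion above:
# step 1 divides Canadian water use by Canadian GDP expressed in USD to
# get a water-use intensity (Mgal/USD); step 2 multiplies that intensity
# by US GDP to estimate US water use.
def _example_water_use_conversion():
    can_water_mgal = 500.0   # Canadian water use, Mgal
    can_gdp_cad = 2000.0     # Canadian GDP, CAD
    exchange_rate = 1.25     # CAD per USD (illustrative)
    intensity = can_water_mgal / (can_gdp_cad / exchange_rate)  # 0.3125
    us_gdp_usd = 4000.0
    return intensity * us_gdp_usd  # 1250.0 Mgal estimated US water use
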
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector
        method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)

            # create dictionary of allocation datasets for
            # different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[
                        aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                    drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of
                # specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(
                        sys.modules[__name__],
                        v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec,
                                                     attr=attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k
                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(
                        flow_subset_mapped, attr, method)

                # if allocation method is "direct", then no need to create
                # alloc ratios, else need to use allocation dataframe to
                # create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector like,
                    # check that there is no data loss
                    if load_source_catalog()[k][
                            'sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            fbs_subset = fbs[
                                ((fbs[fba_activity_fields[0]] == n) &
                                 (fbs[fba_activity_fields[1]] == n)) |
                                (fbs[fba_activity_fields[0]] == n) |
                                (fbs[fba_activity_fields[1]] == n
                                 )].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)
                # if allocation method for an activity set requires a
                # specific function due to the complicated nature of the
                # allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info('Calling on function specified in method yaml '
                             'to allocate ' + ', '.join(map(str, names)) +
                             ' to sectors')
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                        flow_subset_mapped, attr, fbs_list)
                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    # clean df and harmonize units
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified
                    # geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the
                    # flowbyactivity source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba"])(fba_allocation,
                                                          attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba
                    # allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                            fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors
                    # associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec, k, names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify
                    # allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for "
                                 "subset of " + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [e for e in group_cols if e not in
                                  ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset, k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2, k,
                                attr['allocation_source'],
                                attr['allocation_method'], group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable link to
                    # main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(
                        flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k,
                                            method_name)

                    # create list of sectors in the flow allocation df,
                    # drop any rows of data in the flow df that aren't
                    # in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the
                    # activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].isin(
                            sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].isin(
                            sector_list))]

                    # check if fba and allocation dfs have the same
                    # LocationSystem
                    log.info("Checking if flowbyactivity and allocation "
                             "dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[['Location', 'Sector',
                                         'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorProducedBy',
                                 'ActivityProducedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector',
                                         'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorConsumedBy',
                                 'ActivityConsumedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same
                    # geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation "
                    #          "dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(
                    #     fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = \
                        fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x',
                                            'Sector_y', 'FlowAmountRatio_y',
                                            'FlowAmountRatio',
                                            'FBA_Activity_x',
                                            'FBA_Activity_y'])

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being
                # activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] \
                        is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated
                # with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching
                # the target sector length, if so, allocate values equally
                # to disaggregated sectors
                log.info('Checking for data at ' +
                         method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like
                # in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] \
                        is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull()
                     )].reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an
                # activity set
                compare_fba_load_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k,
                    method_name, attr, method, mapping_files)

                log.info("Completed flowbysector for activity subset with "
                         "flows " + ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format,
            # append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same
    # specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)

def check_for_differences_between_fba_load_and_fbs_output(
        fba_load, fbs_load, activity_set, source_name):
    """
    Function to compare the loaded flowbyactivity with the final
    flowbysector output, checking for data loss
    :param fba_load: df, FBA loaded and mapped
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :return: None, saves comparison as csv in output folder
    """
    # test
    # fba_load = flow_subset_mapped.copy()
    # fbs_load = fbs_agg.copy()
    # activity_set = aset
    # source_name = k

    # subset fba df
    fba = fba_load[['Class', 'SourceName', 'Flowable', 'Unit', 'FlowType',
                    'ActivityProducedBy', 'ActivityConsumedBy', 'Context',
                    'Location', 'LocationSystem', 'Year', 'FlowAmount'
                    ]].drop_duplicates().reset_index(drop=True)
    fba.loc[:, 'Location'] = US_FIPS
    group_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                  'Unit', 'FlowType', 'Context', 'Location',
                  'LocationSystem', 'Year']
    fba_agg = aggregator(fba, group_cols)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

    # subset fbs df
    fbs = fbs_load[['Class', 'SectorSourceName', 'Flowable', 'Unit',
                    'FlowType', 'SectorProducedBy', 'SectorConsumedBy',
                    'ActivityProducedBy', 'ActivityConsumedBy', 'Context',
                    'Location', 'LocationSystem', 'Year', 'FlowAmount'
                    ]].drop_duplicates().reset_index(drop=True)

    fbs['SectorProducedBy'] = fbs['SectorProducedBy'].replace({'nan': ''})
    fbs['SectorConsumedBy'] = fbs['SectorConsumedBy'].replace({'nan': ''})
    fbs['ActivityProducedBy'] = \
        fbs['ActivityProducedBy'].replace({'nan': None})
    fbs['ActivityConsumedBy'] = \
        fbs['ActivityConsumedBy'].replace({'nan': None})

    fbs['ProducedLength'] = fbs['SectorProducedBy'].apply(lambda x: len(x))
    fbs['ConsumedLength'] = fbs['SectorConsumedBy'].apply(lambda x: len(x))
    fbs['SectorLength'] = fbs[['ProducedLength',
                               'ConsumedLength']].max(axis=1)
    fbs.loc[:, 'Location'] = US_FIPS
    group_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                  'Unit', 'FlowType', 'Context', 'Location',
                  'LocationSystem', 'Year', 'SectorLength']
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

    # merge compare 1 and compare 2
    comparison = fba_agg.merge(
        fbs_agg,
        left_on=['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                 'Unit', 'FlowType', 'Context', 'Location',
                 'LocationSystem', 'Year'],
        right_on=['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                  'Unit', 'FlowType', 'Context', 'Location',
                  'LocationSystem', 'Year'],
        how='left')
    comparison['Ratio'] = comparison['FBS_amount'] / comparison['FBA_amount']

    # reorder
    comparison = comparison[['ActivityProducedBy', 'ActivityConsumedBy',
                             'Flowable', 'Unit', 'FlowType', 'Context',
                             'Location', 'LocationSystem', 'Year',
                             'SectorLength', 'FBA_amount', 'FBS_amount',
                             'Ratio']]

    # todo: address the duplicated rows/data that occur for non-naics
    #  household sector length

    ua_count1 = len(comparison[comparison['Ratio'] < 0.95])
    log.info('There are ' + str(ua_count1) +
             ' combinations of flowable/context/sector length where the '
             'flowbyactivity to flowbysector ratio is < 0.95')
    ua_count2 = len(comparison[comparison['Ratio'] < 0.99])
    log.info('There are ' + str(ua_count2) +
             ' combinations of flowable/context/sector length where the '
             'flowbyactivity to flowbysector ratio is < 0.99')
    oa_count1 = len(comparison[comparison['Ratio'] > 1])
    log.info('There are ' + str(oa_count1) +
             ' combinations of flowable/context/sector length where the '
             'flowbyactivity to flowbysector ratio is > 1.0')
    oa_count2 = len(comparison[comparison['Ratio'] > 1.01])
    log.info('There are ' + str(oa_count2) +
             ' combinations of flowable/context/sector length where the '
             'flowbyactivity to flowbysector ratio is > 1.01')

    # save csv to output folder
    log.info('Save the comparison of FlowByActivity load to FlowBySector '
             'ratios for ' + activity_set + ' in output folder')
    comparison.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                      source_name + "_FBA_load_to_FBS_comparison_" +
                      activity_set + ".csv", index=False)

    return None

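# Sketch (assumed values): interpreting the FBS/FBA ratios binned above.
# Ratios below 1 suggest flow lost during sector attribution; ratios
# above 1 suggest double counting.
def _example_interpret_ratio():
    fba_amount, fbs_amount = 200.0, 190.0
    ratio = fbs_amount / fba_amount  # 0.95 -> counted in the '< 0.99' bin
    return ratio
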
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments; "method" is the name of the
        flowbysector method yaml, and "download_FBAs_if_missing"
        optionally triggers FBA downloads
    :return: parquet, FBS save to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for
            # different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] ==
                                   aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[
                        flows_subset['FlowName'].isin(attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info('Calculate FlowAmount difference caused by '
                              'replacing NAICS Codes with %s, saving '
                              'difference in Validation log',
                              method['target_sector_source'])
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo,
                                              flows_mapped, k, attr,
                                              aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k,
                                                   names, method)
                # if allocation method for an activity set requires a
                # specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(
                        flows_mapped_wsec, attr, names, method, k, v, aset,
                        aset_names, download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(
                    fbs, from_scale, method['target_geoscale'], groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info('Searching for and allocating FlowAmounts for any '
                          'parent NAICS that were dropped in the subset to '
                          '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(
                    flows_mapped_wsec, fbs_agg_2, aset, k, method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an
                # activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v,
                    attr, method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(
                    v["clean_fbs_df_fxn_source"],
                    v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow dt is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info('See the Validation log for detailed assessment of '
             'model results in %s', logoutputpath)

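# Hypothetical usage sketch for the main() above. The method name follows
# the flowsa FBS method-yaml naming pattern but is illustrative; available
# method names depend on the flowsa release.
# if __name__ == '__main__':
#     main(method='Water_national_2015_m1', download_FBAs_if_missing=True)
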
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Function to compare the loaded flowbyactivity total with the final
    flowbysector output total
    :param fba_load: df, loaded FBA
    :param fbs_load: df, final FBS
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param method_name: str, FBS method name
    :param attr: dictionary, attribute data from method yaml for
        activity set
    :param method: dictionary, FBS method yaml
    :param mapping_files: mapping files for elementary flows
    :return: None, prints data differences and saves csv to output folder
    """
    from flowsa.flowbyfunctions import subset_df_by_geoscale, \
        sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info('Comparing loaded FlowByActivity FlowAmount total to subset '
             'FlowBySector FlowAmount total')

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[['Class', 'FlowAmount', 'Unit', 'Context',
                   'ActivityProducedBy', 'ActivityConsumedBy',
                   'Location', 'LocationSystem']]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(columns={'ActivityProducedBy': 'SectorProducedBy',
                                  'ActivityConsumedBy': 'SectorConsumedBy'})
        group_cols_agg = ['Class', 'Context', 'Unit', 'Location',
                          'LocationSystem', 'SectorProducedBy',
                          'SectorConsumedBy']
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = ['Class', 'FlowAmount', 'Unit', 'Context',
                  'Location', 'LocationSystem']
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount',
                            'Unit': 'FBA_unit'}, inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount',
                            'Unit': 'FBS_unit'}, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference'] /
             df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[['Class', 'Context', 'Location',
                             'LocationSystem', 'FBA_amount', 'FBA_unit',
                             'FBS_amount', 'FBS_unit',
                             'FlowAmount_difference', 'Percent_difference']]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts
        context_list = df_merge['Context'].to_list()

        # loop through the contexts and print results of comparison
        for i in context_list:
            df_merge_subset = \
                df_merge[df_merge['Context'] == i].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i +
                         ' is ' + str(abs(diff_per)) +
                         '% less than the total FlowByActivity FlowAmount')
            else:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i +
                         ' is ' + str(abs(diff_per)) +
                         '% more than the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info('Save the comparison of FlowByActivity load to '
                 'FlowBySector total FlowAmounts for ' + activity_set +
                 ' in output folder')
        # output data at all sector lengths
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv", index=False)

    except Exception:
        log.info('Error occurred when comparing total FlowAmounts for '
                 'FlowByActivity and FlowBySector')

    return None

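# Sketch (assumed values): sign convention of Percent_difference above --
# a positive value means the FBS total is smaller than the FBA total.
def _example_fba_fbs_percent_difference():
    fba_amount, fbs_amount = 120.0, 114.0
    diff = fba_amount - fbs_amount
    return (diff / fba_amount) * 100  # 5.0 -> FBS is 5% less than FBA
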
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                names = attr['names']
                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)

                # check if flowbyactivity data exists at specified geoscale to use
                flow_subset_list = []
                for n in names:
                    # subset usgs data by activity
                    flow_subset = flows[(flows[fba_activity_fields[0]] == n) |
                                        (flows[fba_activity_fields[1]] == n)].reset_index(drop=True)
                    log.info("Checking if flowbyactivity data exists for " + n +
                             " at the " + v['geoscale_to_use'] + ' level')
                    geocheck = check_if_data_exists_at_geoscale(
                        flow_subset, v['geoscale_to_use'], activitynames=n)
                    # aggregate geographically to the scale of the allocation dataset
                    if geocheck == "Yes":
                        activity_from_scale = v['geoscale_to_use']
                    else:
                        # if activity does not exist at specified geoscale, issue
                        # warning and use data at a less aggregated geoscale,
                        # then sum to the specified geoscale
                        log.info("Checking if flowbyactivity data exists for " +
                                 n + " at a less aggregated level")
                        activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(
                            flow_subset, v['geoscale_to_use'], n)

                    activity_to_scale = attr['allocation_from_scale']
                    # if df is less aggregated than allocation df, aggregate
                    # usgs activity to allocation geoscale
                    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                        log.info("Aggregating subset from " + activity_from_scale +
                                 " to " + activity_to_scale)
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                                      activity_to_scale,
                                                      fba_default_grouping_fields, n)
                    # else, aggregate to the geoscale to use
                    elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                        log.info("Aggregating subset from " + activity_from_scale +
                                 " to " + v['geoscale_to_use'])
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                                      v['geoscale_to_use'],
                                                      fba_default_grouping_fields, n)
                    # else, if usgs is more aggregated than allocation table,
                    # filter relevant rows
                    else:
                        log.info("Subsetting " + activity_from_scale + " data")
                        flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n)

                    # Add sectors to df activity, depending on level of specified
                    # sector aggregation
                    log.info("Adding sectors to " + k + " for " + n)
                    flow_subset_wsec = add_sectors_to_flowbyactivity(
                        flow_subset, sectorsourcename=method['target_sector_source'],
                        levelofSectoragg=attr['activity_sector_aggregation'])
                    flow_subset_list.append(flow_subset_wsec)
                flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True)

                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k)

                # if allocation method is "direct", then no need to create
                # allocation ratios, else need to use allocation dataframe to
                # create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) +
                             ' to sectors')
                    fbs = flow_subset_wsec.copy()
                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields,
                                              fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation,
                                                     attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity
                    # source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate
                    # allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(fba_allocation, from_scale,
                                                         to_scale,
                                                         fba_default_grouping_fields, names)
                    # else, if usgs is more aggregated than allocation table,
                    # use usgs as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation = add_sectors_to_flowbyactivity(
                        fba_allocation, sectorsourcename=method['target_sector_source'],
                        levelofSectoragg=attr['allocation_sector_aggregation'])
                    # subset fba datasets to only keep the sectors associated
                    # with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # call on fxn to further disaggregate the fba allocation data, if exists
                    if 'allocation_disaggregation_fxn' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_subset = getattr(
                            sys.modules[__name__],
                            attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr)

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation helper for subset of " +
                                 attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset,
                                                                  method, attr)

                    # create flow allocation ratios
                    log.info("Creating allocation ratios for " + attr['allocation_source'])
                    flow_allocation = allocate_by_sector(fba_allocation_subset,
                                                         attr['allocation_method'])

                    # create list of sectors in the flow allocation df, drop any
                    # rows of data in the flow df that aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity
                    # list, based on overlapping sectors
                    flow_subset_wsec = flow_subset_wsec.loc[
                        (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes "
                             "use the same location systems")
                    check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_wsec.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorProducedBy'],
                        right_on=['Location', 'Sector'], how='left')
                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorConsumedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                        fbs['FlowAmountRatio_y'])

                    # check if fba and alloc dfs have data for same geoscales -
                    # comment back in after addressing the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                            'FlowAmountRatio_y', 'FlowAmountRatio',
                                            'ActivityProducedBy', 'ActivityConsumedBy'])

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " + method['target_geoscale'] +
                         " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']
                fbs = agg_by_geoscale(fbs, from_scale, to_scale,
                                      fbs_default_grouping_fields, names)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs = sector_aggregation(fbs, fbs_default_grouping_fields)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs = sector_disaggregation(fbs)

                # test agg by sector
                # sector_agg_comparison = sector_flow_comparision(fbs)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])
                # add any non-NAICS sectors used with NAICS
                sector_list = add_non_naics_sectors(sector_list,
                                                    method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS
                # and can get duplicate rows
                fbs_1 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) &
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2], sort=False)

                # set source name
                fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source']

                log.info("Completed flowbysector for activity subset with flows " +
                         ', '.join(map(str, names)))
                fbss.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly
            # to list of FBS
            log.info("Append " + k + " to FBS list")
            fbss.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    fbss = fbss.replace({'nan': None})
    # add missing fields, ensure correct data type, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(['SectorProducedBy', 'SectorConsumedBy',
                             'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
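# Hedged usage sketch: run the FBS builder above for a single method yaml. The
# method name below is a placeholder; substitute the name of any flowbysector
# method yaml available to load_method().
if __name__ == '__main__':
    main(method_name='Water_national_2015_m1')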
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian GDP data to convert 3-digit Canadian water use to US water use:
    - Canadian GDP
    - US GDP
    :param df: df, StatCan water use flowbyactivity
    :param attr: dictionary, attributes for the activity set
    :return: df, water use estimated for the US
    """
    import flowsa
    from flowsa.values_from_literature import get_Canadian_to_USD_exchange_rate
    from flowsa.flowbyfunctions import assign_fips_location_system, aggregator, \
        fba_default_grouping_fields
    from flowsa.common import US_FIPS, load_bea_crosswalk

    # load Canadian GDP data
    gdp = flowsa.getFlowByActivity(flowclass=['Money'], datasource='StatCan_GDP',
                                   years=[attr['allocation_source_year']])
    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # merge df
    df_m = pd.merge(df, gdp[['CanDollar', 'ActivityProducedBy']], how='left',
                    left_on='ActivityConsumedBy', right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    # drop rows where there is no Canadian GDP to normalize against
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)

    # convert to mgal/USD
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] / exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'
    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m, str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = flowsa.getFlowByActivity(
        flowclass=['Money'], datasource='BEA_GDP_GrossOutput_IO',
        years=[attr['allocation_source_year']])

    # load bea crosswalk
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    # subset to 3-digit NAICS, to match the water use data
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load, cw, how='left',
                      left_on='ActivityProducedBy', right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual US water use
    df_m2 = pd.merge(df_m, us_gdp[['ActivityProducedBy', 'us_gdp']], how='left',
                     left_on='ActivityConsumedBy', right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * df_m2['us_gdp']
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
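# Worked example of the conversion chain above, with illustrative numbers only
# (assumed values, not StatCan/BEA data): a sector with FlowAmount = 100 Mgal,
# CanDollar = 50, an exchange rate of 0.75, and a matching us_gdp of 200.
flow_amount, can_dollar, exchange_rate, us_gdp = 100.0, 50.0, 0.75, 200.0
intensity = flow_amount / (can_dollar / exchange_rate)  # 1.5 Mgal/USD
us_water_use = intensity * us_gdp                       # 300.0 Mgal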
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " +
                 str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__],
                            v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " +
                     ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " +
                     ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names,
                                                        v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning
                # and use data at a less aggregated geoscale, then sum to the
                # specified geoscale
                log.info("Checking if flowbyactivity data exists for " +
                         ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(
                    flow_subset, names, v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs
            # activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " +
                         activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                              activity_to_scale,
                                              fba_default_grouping_fields, names)
            # else, aggregate to the geoscale to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " +
                         v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                              v['geoscale_to_use'],
                                              fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter
            # relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # location column pad zeros if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version, "flow_subset_wsec", is the most disaggregated
            # version of the sectors (NAICS)
            # the second version, "flow_subset_wsec_agg", includes only the most
            # aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(
                flow_subset, sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(
                flow_subset, sectorsourcename=method['target_sector_source'],
                levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create allocation
            # ratios, else need to use allocation dataframe to create sector
            # allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " +
                         attr['allocation_source'] + " for year " +
                         str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(
                    flowclass=[attr['allocation_source_class']],
                    datasource=attr['allocation_source'],
                    years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if allocation data exists for " +
                         ', '.join(map(str, names)) + " at the " +
                         attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names,
                                                 attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivity
                # source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate
                # allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale,
                                                     to_scale,
                                                     fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs
                # as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(
                    fba_allocation, sectorsourcename=method['target_sector_source'],
                    levelofSectoragg=attr['allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs
                # activity subset
                log.info("Subsetting " + attr['allocation_source'] +
                         " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water
                # withdrawal table
                log.info("Generalizing activity names in subset of " +
                         attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation helper for subset of " +
                             attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset,
                                                              method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset,
                                                     attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop any rows
                # of data in the flow df that aren't in list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list,
                # based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes "
                         "use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector',
                                     'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')
                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector',
                                     'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both sector produced/consumed by in flow
                # allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                    fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                        'FlowAmountRatio_y', 'FlowAmountRatio',
                                        'ActivityProducedBy', 'ActivityConsumedBy'])
                # rename flow name to flowable
                fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                          "Compartment": "Context"})

            # drop rows where flowamount = 0
            # (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] +
                     " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']
            fbs = agg_by_geoscale(fbs, from_scale, to_scale,
                                  fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()
            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[
                household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " +
                     ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(['SectorProducedBy', 'SectorConsumedBy',
                             'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
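# Hedged usage sketch: after store_flowbysector() saves the parquet file, the
# result can be reloaded for inspection. The method name is a placeholder, and
# getFlowBySector() is assumed to be the FBS counterpart of the
# flowsa.getFlowByActivity() accessor used throughout this module.
fbs_df = flowsa.getFlowBySector('Water_national_2015_m1')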
def geoscale_flow_comparison(flowclass, years, datasource,
                             activitynames=['all'], to_scale='national'):
    """
    Aggregates county data to state and national, and state data to national
    level, allowing for comparisons in flow totals for a given flowclass and
    industry. First assigns all flownames to NAICS and standardizes units.

    Assigned to NAICS rather than using FlowNames for aggregation to negate any
    changes in flownames across time/geoscale
    """
    # load parquet file checking aggregation
    flows = flowsa.getFlowByActivity(flowclass=flowclass, years=years,
                                     datasource=datasource)
    # fill null values
    flows = flows.fillna(value=fba_fill_na_dict)
    # convert units
    flows = convert_unit(flows)

    # if activitynames is set to the default, compare aggregation for all
    # activities; otherwise, subset to the specified activities
    if activitynames == ['all']:
        flow_subset = flows.copy()
    else:
        flow_subset = flows[(flows[fba_activity_fields[0]].isin(activitynames)) |
                            (flows[fba_activity_fields[1]].isin(activitynames))]

    # Reset index values after subset
    flow_subset = flow_subset.reset_index()

    # pull naics crosswalk
    mapping = get_activitytosector_mapping(flow_subset['SourceName'].all())

    # assign naics to activities
    # usgs datasource is not easily assigned to naics for checking totals, so
    # instead standardize activity names
    if datasource == 'USGS_NWIS_WU':
        flow_subset = standardize_usgs_nwis_names(flow_subset)
    else:
        flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']],
                               left_on='ActivityProducedBy', right_on='Activity',
                               how='left').rename({'Sector': 'SectorProducedBy'}, axis=1)
        flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']],
                               left_on='ActivityConsumedBy', right_on='Activity',
                               how='left').rename({'Sector': 'SectorConsumedBy'}, axis=1)
    flow_subset = flow_subset.drop(columns=['ActivityProducedBy', 'ActivityConsumedBy',
                                            'Activity_x', 'Activity_y', 'Description'],
                                   errors='ignore')
    flow_subset['SectorProducedBy'] = flow_subset['SectorProducedBy'].replace(
        {np.nan: None}).astype(str)
    flow_subset['SectorConsumedBy'] = flow_subset['SectorConsumedBy'].replace(
        {np.nan: None}).astype(str)

    # create list of geoscales for aggregation
    if to_scale == 'national':
        geoscales = ['national', 'state', 'county']
    elif to_scale == 'state':
        geoscales = ['state', 'county']

    # create empty df list
    flow_dfs = []
    for i in geoscales:
        try:
            # filter by geoscale
            fba_from_scale = filter_by_geoscale(flow_subset, i)

            # group on sector columns rather than location/activity columns
            group_cols = fba_default_grouping_fields.copy()
            for j in ['Location', 'ActivityProducedBy', 'ActivityConsumedBy']:
                group_cols.remove(j)
            for j in ['SectorProducedBy', 'SectorConsumedBy']:
                group_cols.append(j)

            # county sums to state and national, state sums to national
            if to_scale == 'state':
                fba_from_scale['Location'] = fba_from_scale['Location'].apply(
                    lambda x: str(x[0:2]))
            elif to_scale == 'national':
                fba_from_scale['Location'] = US_FIPS

            # aggregate
            fba_agg = aggregator(fba_from_scale, group_cols)

            # rename flowamount column, based on geoscale
            fba_agg = fba_agg.rename(columns={"FlowAmount": "FlowAmount_" + i})

            # drop fields irrelevant to aggregated flow comparison
            drop_fields = ['MeasureofSpread', 'Spread', 'DistributionType',
                           'DataReliability', 'DataCollection']
            fba_agg = fba_agg.drop(columns=drop_fields)

            # reset index
            fba_agg = fba_agg.reset_index(drop=True)

            flow_dfs.append(fba_agg)
        except Exception:
            # skip geoscales for which the datasource has no data
            pass

    # merge list of dfs by column
    flow_comparison = reduce(
        lambda left, right: pd.merge(left, right,
                                     on=['Class', 'SourceName', 'FlowName', 'Unit',
                                         'SectorProducedBy', 'SectorConsumedBy',
                                         'Compartment', 'Location',
                                         'LocationSystem', 'Year'],
                                     how='outer'), flow_dfs)

    # sort df
    flow_comparison = flow_comparison.sort_values(['Year', 'Location',
                                                   'SectorProducedBy',
                                                   'SectorConsumedBy',
                                                   'FlowName', 'Compartment'])

    return flow_comparison
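# Illustrative call (example arguments, not prescribed defaults): compare
# county, state, and national water withdrawal totals for the USGS water use
# data referenced in the function above.
comparison_df = geoscale_flow_comparison(flowclass=['Water'], years=[2015],
                                         datasource='USGS_NWIS_WU')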