def load_source_dataframe(k, v): """ Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector formatted dataframe from another package. :param k: The datasource name :param v: The datasource parameters :return: """ if v['data_format'] == 'FBA': # if yaml specifies a geoscale to load, use parameter to filter dataframe if 'source_fba_load_scale' in v: geo_level = v['source_fba_load_scale'] else: geo_level = 'all' log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year'])) flows_df = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']], datasource=k, geographic_level=geo_level) elif v['data_format'] == 'FBS': log.info("Retrieving flowbysector for datasource " + k) flows_df = flowsa.getFlowBySector(k) elif v['data_format'] == 'FBS_outside_flowsa': log.info("Retrieving flowbysector for datasource " + k) flows_df = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])(*v['parameters']) else: log.error("Data format not specified in method file for datasource " + k) return flows_df
def load_source_dataframe(k, v): """ Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector formatted dataframe from another package. :param k: str, The datasource name :param v: dictionary, The datasource parameters :return: df of identified parquet """ if v['data_format'] == 'FBA': # if yaml specifies a geoscale to load, use parameter to filter dataframe if 'source_fba_load_scale' in v: geo_level = v['source_fba_load_scale'] else: geo_level = None vLog.info("Retrieving flowbyactivity for datasource %s in year %s", k, str(v['year'])) flows_df = flowsa.getFlowByActivity(datasource=k, year=v['year'], flowclass=v['class'], geographic_level=geo_level) elif v['data_format'] == 'FBS': vLog.info("Retrieving flowbysector for datasource %s", k) flows_df = flowsa.getFlowBySector(k) elif v['data_format'] == 'FBS_outside_flowsa': vLog.info("Retrieving flowbysector for datasource %s", k) flows_df = dynamically_import_fxn(k, v["FBS_datapull_fxn"])(v) else: vLog.error( "Data format not specified in method file for datasource %s", k) return flows_df
def load_fba_w_standardized_units(datasource, year, **kwargs): """ Standardize how a FBA is loaded for allocation purposes when generating a FBS. Important to immediately convert the df units to standardized units. :param datasource: string, FBA source name :param year: int, year of data :param kwargs: optional parameters include flowclass, geographic_level, and download_if_missing :return: fba df with standardized units """ # determine if any addtional parameters required to load a Flow-By-Activity # add parameters to dictionary if exist in method yaml fba_dict = {} if 'flowclass' in kwargs: fba_dict['flowclass'] = kwargs['flowclass'] if 'geographic_level' in kwargs: fba_dict['geographic_level'] = kwargs['geographic_level'] # load the allocation FBA fba = flowsa.getFlowByActivity(datasource, year, **fba_dict).reset_index(drop=True) # ensure df loaded correctly/has correct dtypes fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict) # convert to standardized units fba = standardize_units(fba) return fba
def convert_blackhurst_data_to_gal_per_year(df, attr): import flowsa from flowsa.mapping import add_sectors_to_flowbyactivity from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict # load the bea make table bmt = flowsa.getFlowByActivity(flowclass=['Money'], datasource='BEA_Make_Table', years=[2002]) # clean df bmt = clean_df(bmt, flow_by_activity_fields, fba_fill_na_dict) # drop rows with flowamount = 0 bmt = bmt[bmt['FlowAmount'] != 0] bh_df_revised = pd.merge( df, bmt[['FlowAmount', 'ActivityProducedBy', 'Location']], left_on=['ActivityConsumedBy', 'Location'], right_on=['ActivityProducedBy', 'Location']) bh_df_revised.loc[:, 'FlowAmount'] = ((bh_df_revised['FlowAmount_x']) * (bh_df_revised['FlowAmount_y'])) bh_df_revised.loc[:, 'Unit'] = 'gal' # drop columns bh_df_revised = bh_df_revised.drop( columns=["FlowAmount_x", "FlowAmount_y", 'ActivityProducedBy_y']) bh_df_revised = bh_df_revised.rename( columns={"ActivityProducedBy_x": "ActivityProducedBy"}) return bh_df_revised
def disaggregate_pastureland(fba_w_sector, attr): """ The USDA CoA Cropland irrigated pastureland data only links to the 3 digit NAICS '112'. This function uses state level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS. :param fba_w_sector: The CoA Cropland dataframe after linked to sectors :return: The CoA cropland dataframe with disaggregated pastureland data """ import flowsa from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \ fba_fill_na_dict # subset the coa data so only pastureland p = fba_w_sector.loc[fba_w_sector['Sector'] == '112'] # add temp loc column for state fips p.loc[:, 'Location_tmp'] = p['Location'].apply(lambda x: str(x[0:2])) # load usda coa cropland naics df_f = flowsa.getFlowByActivity(flowclass=['Land'], years=[attr['allocation_source_year']], datasource='USDA_CoA_Cropland_NAICS') df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict) # subset to land in farms data df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS'] # subset to rows related to pastureland df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: str(x[0:3])) == '112'] # drop rows with "&' df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')] # create sector column df_f.loc[:, 'Sector'] = df_f['ActivityConsumedBy'] # create proportional ratios df_f = allocate_by_sector(df_f, 'proportional') # drop naics = '11 df_f = df_f[df_f['Sector'] != '11'] # drop 000 in location df_f.loc[:, 'Location'] = df_f['Location'].apply(lambda x: str(x[0:2])) # merge the coa pastureland data with land in farm data df = p.merge(df_f[['Sector', 'Location', 'FlowAmountRatio']], how='left', left_on="Location_tmp", right_on="Location") # multiply the flowamount by the flowratio df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio'] # drop columns and rename df = df.drop( columns=['Location_tmp', 'Sector_x', 'Location_y', 'FlowAmountRatio']) df = df.rename(columns={"Sector_y": "Sector", "Location_x": 'Location'}) # drop rows where sector = 112 and then concat with original fba_w_sector fba_w_sector = fba_w_sector[fba_w_sector['Sector'].apply( lambda x: str(x[0:3])) != '112'].reset_index(drop=True) fba_w_sector = pd.concat([fba_w_sector, df], sort=False).reset_index(drop=True) return fba_w_sector
def allocation_helper(df_w_sector, method, attr): """ Used when two df required to create allocation ratio :param df_w_sector: :param method: currently written for 'multiplication' :param attr: :return: """ helper_allocation = flowsa.getFlowByActivity( flowclass=[attr['helper_source_class']], datasource=attr['helper_source'], years=[attr['helper_source_year']]) # fill null values helper_allocation = helper_allocation.fillna(value=fba_fill_na_dict) # convert unit helper_allocation = convert_unit(helper_allocation) # assign naics to allocation dataset helper_allocation = add_sectors_to_flowbyactivity( helper_allocation, sectorsourcename=method['target_sector_source'], levelofSectoragg=attr['helper_sector_aggregation']) # generalize activity field names to enable link to water withdrawal table helper_allocation = generalize_activity_field_names(helper_allocation) # drop columns helper_allocation = helper_allocation.drop( columns=['Activity', 'Description', 'Min', 'Max']) # rename column helper_allocation = helper_allocation.rename( columns={"FlowAmount": 'HelperFlow'}) # merge allocation df with helper df based on sectors, depending on geo scales of dfs if attr['helper_from_scale'] == 'national': modified_fba_allocation = df_w_sector.merge( helper_allocation[['Sector', 'HelperFlow']], how='left') if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'): helper_allocation['Location_tmp'] = helper_allocation[ 'Location'].apply(lambda x: str(x[0:2])) df_w_sector['Location_tmp'] = df_w_sector['Location'].apply( lambda x: str(x[0:2])) modified_fba_allocation = df_w_sector.merge( helper_allocation[['Sector', 'Location_tmp', 'HelperFlow']], how='left') modified_fba_allocation = modified_fba_allocation.drop( columns=['Location_tmp']) # modify flow amounts using helper data if attr['helper_method'] == 'multiplication': modified_fba_allocation['FlowAmount'] = modified_fba_allocation[ 'FlowAmount'] * modified_fba_allocation['HelperFlow'] # drop columns modified_fba_allocation = modified_fba_allocation.drop( columns="HelperFlow") return modified_fba_allocation
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method): import flowsa from flowsa.mapping import add_sectors_to_flowbyactivity from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict, agg_by_geoscale, fba_default_grouping_fields, \ sector_ratios, proportional_allocation_by_location_and_sector, filter_by_geoscale from flowsa.BLS_QCEW import clean_bls_qcew_fba bls = flowsa.getFlowByActivity(flowclass=['Employment'], datasource='BLS_QCEW', years=[2002]) # clean df bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict) bls = clean_bls_qcew_fba(bls, attr) # bls_agg = agg_by_geoscale(bls, 'state', 'national', fba_default_grouping_fields) bls_agg = filter_by_geoscale(bls, 'national') # assign naics to allocation dataset bls_wsec = add_sectors_to_flowbyactivity( bls_agg, sectorsourcename=method['target_sector_source']) # drop rows where sector = None ( does not occur with mining) bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()] bls_wsec = bls_wsec.rename(columns={'SectorProducedBy': 'Sector'}) # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \ # aren't in list sector_list = df_wsec['Sector'].unique().tolist() # subset fba allocation table to the values in the activity list, based on overlapping sectors bls_wsec = bls_wsec.loc[bls_wsec['Sector'].isin(sector_list)] # calculate proportional ratios bls_wsec = proportional_allocation_by_location_and_sector( bls_wsec, 'Sector') #, 'agg') bls_wsec = bls_wsec.rename(columns={ 'FlowAmountRatio': 'EmployeeRatio', 'FlowAmount': 'Employees' }) # merge the two dfs df = pd.merge(df_wsec, bls_wsec[['Sector', 'EmployeeRatio', 'Employees']], how='left', left_on='Sector', right_on='Sector') df['EmployeeRatio'] = df['EmployeeRatio'].fillna(0) # calculate gal/employee in 2002 df.loc[:, 'FlowAmount'] = (df['FlowAmount'] * df['EmployeeRatio']) / df['Employees'] df.loc[:, 'Unit'] = 'gal/employee' # drop cols df = df.drop(columns=['Employees', 'EmployeeRatio']) return df
def load_fba_w_standardized_units(datasource, year, **kwargs): """ Standardize how a FBA is loaded for allocation purposes when generating a FBS. Important to immediately convert the df units to standardized units. :param datasource: string, FBA source name :param year: int, year of data :param kwargs: optional parameters include flowclass, geographic_level, download_if_missing, allocation_map_to_flow_list :return: fba df with standardized units """ from flowsa.sectormapping import map_fbs_flows # determine if any addtional parameters required to load a Flow-By-Activity # add parameters to dictionary if exist in method yaml fba_dict = {} if 'flowclass' in kwargs: fba_dict['flowclass'] = kwargs['flowclass'] if 'geographic_level' in kwargs: fba_dict['geographic_level'] = kwargs['geographic_level'] if 'download_FBA_if_missing' in kwargs: fba_dict['download_FBA_if_missing'] = kwargs['download_FBA_if_missing'] # load the allocation FBA fba = flowsa.getFlowByActivity(datasource, year, **fba_dict).reset_index(drop=True) # convert to standardized units either by mapping to federal # flow list/material flow list or by using function. Mapping will add # context and flowable columns if 'allocation_map_to_flow_list' in kwargs: if kwargs['allocation_map_to_flow_list']: # ensure df loaded correctly/has correct dtypes fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict, drop_description=False) fba, mapping_files = map_fbs_flows(fba, datasource, kwargs, keep_fba_columns=True, keep_unmapped_rows=True) else: # ensure df loaded correctly/has correct dtypes fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict) fba = standardize_units(fba) else: # ensure df loaded correctly/has correct dtypes fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict) fba = standardize_units(fba) return fba
def assign_nonpoint_dqi(args): ''' Compares facility coverage data between NEI point and Census to estimate facility coverage in NEI nonpoint ''' import stewi import flowsa nei_facility_list = stewi.getInventoryFacilities('NEI', args['year']) nei_count = nei_facility_list.groupby('NAICS')['FacilityID'].count() census = flowsa.getFlowByActivity(flowclass=['Other'], years=[args['year']], datasource="Census_CBP") census = census[census['FlowName'] == 'Number of establishments'] census_count = census.groupby('ActivityProducedBy')['FlowAmount'].sum()
def scale_blackhurst_results_to_usgs_values(df_to_scale, attr): """ Scale the initial estimates for Blackhurst-based mining estimates to USGS values. Oil-based sectors are allocated a larger percentage of the difference between initial water withdrawal estimates and published USGS values. This method is based off the Water Satellite Table created by Yang and Ingwersen, 2017 :param df_to_scale: :param attr: :return: """ import flowsa from flowsa.flowbyfunctions import harmonize_units # determine national level published withdrawal data for usgs mining in FBS method year pv_load = flowsa.getFlowByActivity(flowclass=['Water'], years=[str(attr['helper_source_year'])], datasource="USGS_NWIS_WU") pv_load = harmonize_units(pv_load) pv_sub = pv_load[(pv_load['Location'] == str(US_FIPS)) & ( pv_load['ActivityConsumedBy'] == 'Mining')].reset_index(drop=True) pv = pv_sub['FlowAmount'].loc[ 0] * 1000000 # usgs unit is Mgal, blackhurst unit is gal # sum quantity of water withdrawals already allocated to sectors av = df_to_scale['FlowAmount'].sum() # calculate the difference between published value and allocated value vd = pv - av # subset df to scale into oil and non-oil sectors df_to_scale['sector_label'] = np.where( df_to_scale['SectorConsumedBy'].apply(lambda x: x[0:5] == '21111'), 'oil', 'nonoil') df_to_scale['ratio'] = np.where(df_to_scale['sector_label'] == 'oil', 2 / 3, 1 / 3) df_to_scale['label_sum'] = df_to_scale.groupby( ['Location', 'sector_label'])['FlowAmount'].transform('sum') df_to_scale.loc[:, 'value_difference'] = vd.astype(float) # calculate revised water withdrawal allocation df_scaled = df_to_scale.copy() df_scaled.loc[:, 'FlowAmount'] = df_scaled['FlowAmount'] + \ (df_scaled['FlowAmount'] / df_scaled['label_sum']) * \ (df_scaled['ratio'] * df_scaled['value_difference']) df_scaled = df_scaled.drop( columns=['sector_label', 'ratio', 'label_sum', 'value_difference']) return df_scaled
def get_fba_subset(name, year, flowclass): test_fba = flowsa.getFlowByActivity(flowclass=[flowclass], years=[year],datasource=name) if subset_activities: aset_names = pd.read_csv(flowbysectoractivitysetspath+asets_source,dtype=str) asets = [ #'activity_set_1', #'activity_set_2', #'activity_set_3', #'activity_set_4', 'activity_set_5', 'activity_set_6', ] activities = aset_names[aset_names['activity_set'].isin(asets)]['name'] test_fba = test_fba[test_fba['ActivityProducedBy'].isin(activities)] return test_fba
def unique_activity_names(flowclass, years, datasource): """read in the ers parquet files, select the unique activity names, return df with one column """ # create single df representing all selected years df = flowsa.getFlowByActivity(flowclass, years, datasource) column_activities = df[["ActivityConsumedBy", "ActivityProducedBy"]].values.ravel() unique_activities = pd.unique(column_activities) df_unique = unique_activities.reshape((-1, 1)) df_unique = pd.DataFrame({'Activity': df_unique[:, 0]}) df_unique = df_unique.loc[df_unique['Activity'].notnull()] df_unique = df_unique.loc[df_unique['Activity'] != 'None'] df_unique = df_unique.assign(ActivitySourceName=datasource) # sort df df_unique = df_unique.sort_values(['Activity']).reset_index(drop=True) return df_unique
def load_source_dataframe(k, v): """ Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector formatted dataframe from another package. :param k: The datasource name :param v: The datasource parameters :return: """ if v['data_format'] == 'FBA': log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year'])) flows_df = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']], datasource=k) elif v['data_format'] == 'FBS': log.info("Retrieving flowbysector for datasource " + k) flows_df = flowsa.getFlowBySector(k) elif v['data_format'] == 'FBS_outside_flowsa': log.info("Retrieving flowbysector for datasource " + k) flows_df = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])(v['parameters']) else: log.error("No parquet file found for datasource " + k) return flows_df
def load_source_dataframe(sourcename, source_dict, download_FBA_if_missing): """ Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector formatted dataframe from another package. :param sourcename: str, The datasource name :param source_dict: dictionary, The datasource parameters :param download_FBA_if_missing: Bool, if True will download FBAs from Data Commons. Default is False. :return: df of identified parquet """ if source_dict['data_format'] == 'FBA': # if yaml specifies a geoscale to load, use parameter # to filter dataframe if 'source_fba_load_scale' in source_dict: geo_level = source_dict['source_fba_load_scale'] else: geo_level = None vLog.info("Retrieving Flow-By-Activity for datasource %s in year %s", sourcename, str(source_dict['year'])) flows_df = flowsa.getFlowByActivity( datasource=sourcename, year=source_dict['year'], flowclass=source_dict['class'], geographic_level=geo_level, download_FBA_if_missing=download_FBA_if_missing) elif source_dict['data_format'] == 'FBS': vLog.info("Retrieving flowbysector for datasource %s", sourcename) flows_df = flowsa.getFlowBySector(sourcename) elif source_dict['data_format'] == 'FBS_outside_flowsa': vLog.info("Retrieving flowbysector for datasource %s", sourcename) flows_df = dynamically_import_fxn( sourcename, source_dict["FBS_datapull_fxn"])(source_dict) else: vLog.error( "Data format not specified in method " "file for datasource %s", sourcename) return flows_df
import flowsa from flowsa.common import flowbysectoractivitysetspath # as_year = '2010' as_year = '2014' if __name__ == '__main__': # define mecs land fba parameters land_flowclass = ['Land'] land_years = [as_year] datasource = 'EIA_MECS_Land' # Read BLM PLS crosswalk df_import = flowsa.getFlowByActivity(land_flowclass, land_years, datasource) # drop unused columns df = df_import[['ActivityConsumedBy']].drop_duplicates() df = df[~df['ActivityConsumedBy'].str.contains('-')] # rename columns df = df.rename(columns={"ActivityConsumedBy": "name"}) # assign column values df = df.assign(activity_set='activity_set_1') df = df.assign(note='') # reorder dataframe df = df[['activity_set', 'name', 'note']] df = df.sort_values(['activity_set', 'name']).reset_index(drop=True)
See source_catalog.yaml for available FlowByActivity datasets and available parameters for getFlowByActivity() Examples of use of flowsa. Read parquet files as dataframes. :param flowclass: list, a list of`Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other'] :param year: list, a list of years [2015], or [2010,2011,2012] :param datasource: str, the code of the datasource. :param geographic_level: default set to 'all', which will load all geographic scales in the FlowByActivity, can specify 'national', 'state', 'county' :return: a pandas DataFrame in FlowByActivity format """ import flowsa from flowsa.common import fbaoutputpath # single flowclass, year, datasource, geographic_level default = 'all', file_location default = 'local' usda_cropland_fba_2017 = flowsa.getFlowByActivity( flowclass=['Land'], years=[2017], datasource="USDA_CoA_Cropland") # load file from remote server instead of local directory usda_cropland_fba_2017_remote = flowsa.getFlowByActivity( flowclass=['Land'], years=[2017], datasource="USDA_CoA_Cropland", file_location='remote') # multiple flowclass usda_iwms_fba_2013 = flowsa.getFlowByActivity(flowclass=['Land', 'Water'], years=[2013], datasource="USDA_IWMS") # only load state level data and save as csv # set parameters fc = ['Water']
:param year: int, a year, e.g. 2012 :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water' :param geographic_level: str, a geographic level of the data. Optional. E.g. 'national', 'state', 'county'. :return: a pandas DataFrame in FlowByActivity format """ import flowsa from flowsa.common import fbaoutputpath # see available FBA models flowsa.seeAvailableFlowByModels('FBA') # Load all information for USDA Cropland usda_cropland_fba_2017 = flowsa.getFlowByActivity(datasource="USDA_CoA_Cropland", year=2017) # only load state level data and save as csv # set parameters ds = "USGS_NWIS_WU" year_fba = 2015 fc = 'Water' geo_level_fba = 'state' # load FBA usgs_water_fba_2015 =\ flowsa.getFlowByActivity(datasource=ds, year=year_fba, flowclass=fc, geographic_level=geo_level_fba).reset_index(drop=True) # save output to csv usgs_water_fba_2015.Location =\ usgs_water_fba_2015.Location.apply('="{}"'.format) # maintain leading 0s in location col usgs_water_fba_2015.to_csv(fbaoutputpath + ds + "_" + str(year_fba) + ".csv", index=False)
def main(method_name): """ Creates a flowbysector dataset :param method_name: Name of method corresponding to flowbysector method yaml name :return: flowbysector """ log.info("Initiating flowbysector creation for " + method_name) # call on method method = load_method(method_name) # create dictionary of data and allocation datasets fb = method['source_names'] # Create empty list for storing fbs files fbs_list = [] for k, v in fb.items(): # pull fba data for allocation flows = load_source_dataframe(k, v) if v['data_format'] == 'FBA': # ensure correct datatypes and that all fields exist flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict, drop_description=False) # clean up fba, if specified in yaml if v["clean_fba_df_fxn"] != 'None': log.info("Cleaning up " + k + " FlowByActivity") flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows) # if activity_sets are specified in a file, call them here if 'activity_set_file' in v: aset_names = pd.read_csv(flowbysectoractivitysetspath + v['activity_set_file'], dtype=str) # create dictionary of allocation datasets for different activities activities = v['activity_sets'] # subset activity data and allocate to sector for aset, attr in activities.items(): # subset by named activities if 'activity_set_file' in v: names = aset_names[aset_names['activity_set'] == aset]['name'] else: names = attr['names'] log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k) # subset fba data by activity flows_subset = flows[ (flows[fba_activity_fields[0]].isin(names)) | (flows[fba_activity_fields[1]].isin(names))].reset_index( drop=True) # extract relevant geoscale data or aggregate existing data log.info("Subsetting/aggregating dataframe to " + attr['allocation_from_scale'] + " geoscale") flows_subset_geo = subset_df_by_geoscale( flows_subset, v['geoscale_to_use'], attr['allocation_from_scale']) # Add sectors to df activity, depending on level of specified sector aggregation log.info("Adding sectors to " + k) flow_subset_wsec = add_sectors_to_flowbyactivity( flows_subset_geo, sectorsourcename=method['target_sector_source'], allocationmethod=attr['allocation_method']) # clean up fba with sectors, if specified in yaml if v["clean_fba_w_sec_df_fxn"] != 'None': log.info("Cleaning up " + k + " FlowByActivity with sectors") flow_subset_wsec = getattr(sys.modules[__name__], v["clean_fba_w_sec_df_fxn"])( flow_subset_wsec, attr=attr) # map df to elementary flows log.info("Mapping flows in " + k + ' to federal elementary flow list') if 'fedefl_mapping' in v: mapping_files = v['fedefl_mapping'] else: mapping_files = k flow_subset_mapped = map_elementary_flows( flow_subset_wsec, mapping_files) # clean up mapped fba with sectors, if specified in yaml if "clean_mapped_fba_w_sec_df_fxn" in v: log.info("Cleaning up " + k + " FlowByActivity with sectors") flow_subset_mapped = getattr( sys.modules[__name__], v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped, attr, method) # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation # dataframe to create sector allocation ratios if attr['allocation_method'] == 'direct': log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors') fbs = flow_subset_mapped.copy() # for each activity, if activities are not sector like, check that there is no data loss if load_source_catalog( )[k]['sector-like_activities'] is False: activity_list = [] for n in names: log.info('Checking for ' + n + ' at ' + method['target_sector_level']) fbs_subset = fbs[( (fbs[fba_activity_fields[0]] == n) & (fbs[fba_activity_fields[1]] == n)) | (fbs[fba_activity_fields[0]] == n) | (fbs[fba_activity_fields[1]] == n )].reset_index(drop=True) fbs_subset = check_if_losing_sector_data( fbs_subset, method['target_sector_level']) activity_list.append(fbs_subset) fbs = pd.concat(activity_list, ignore_index=True) # if allocation method for an activity set requires a specific function due to the complicated nature # of the allocation, call on function here elif attr['allocation_method'] == 'allocation_function': log.info( 'Calling on function specified in method yaml to allocate ' + ', '.join(map(str, names)) + ' to sectors') fbs = getattr(sys.modules[__name__], attr['allocation_source'])( flow_subset_mapped, attr, fbs_list) else: # determine appropriate allocation dataset log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year'])) fba_allocation = flowsa.getFlowByActivity( flowclass=[attr['allocation_source_class']], datasource=attr['allocation_source'], years=[attr['allocation_source_year'] ]).reset_index(drop=True) # clean df and harmonize unites fba_allocation = clean_df(fba_allocation, flow_by_activity_fields, fba_fill_na_dict) fba_allocation = harmonize_units(fba_allocation) # check if allocation data exists at specified geoscale to use log.info("Checking if allocation data exists at the " + attr['allocation_from_scale'] + " level") check_if_data_exists_at_geoscale( fba_allocation, attr['allocation_from_scale']) # aggregate geographically to the scale of the flowbyactivty source, if necessary fba_allocation = subset_df_by_geoscale( fba_allocation, attr['allocation_from_scale'], v['geoscale_to_use']) # subset based on yaml settings if attr['allocation_flow'] != 'None': fba_allocation = fba_allocation.loc[ fba_allocation['FlowName'].isin( attr['allocation_flow'])] if attr['allocation_compartment'] != 'None': fba_allocation = fba_allocation.loc[ fba_allocation['Compartment'].isin( attr['allocation_compartment'])] # cleanup the fba allocation df, if necessary if 'clean_allocation_fba' in attr: log.info("Cleaning " + attr['allocation_source']) fba_allocation = getattr(sys.modules[__name__], attr["clean_allocation_fba"])( fba_allocation, attr=attr) # reset index fba_allocation = fba_allocation.reset_index(drop=True) # assign sector to allocation dataset log.info("Adding sectors to " + attr['allocation_source']) fba_allocation_wsec = add_sectors_to_flowbyactivity( fba_allocation, sectorsourcename=method['target_sector_source']) # call on fxn to further clean up/disaggregate the fba allocation data, if exists if 'clean_allocation_fba_w_sec' in attr: log.info("Further disaggregating sectors in " + attr['allocation_source']) fba_allocation_wsec = getattr( sys.modules[__name__], attr["clean_allocation_fba_w_sec"])( fba_allocation_wsec, attr=attr, method=method) # subset fba datasets to only keep the sectors associated with activity subset log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k) fba_allocation_subset = get_fba_allocation_subset( fba_allocation_wsec, k, names, flowSubsetMapped=flow_subset_mapped, allocMethod=attr['allocation_method']) # if there is an allocation helper dataset, modify allocation df if attr['allocation_helper'] == 'yes': log.info( "Using the specified allocation help for subset of " + attr['allocation_source']) fba_allocation_subset = allocation_helper( fba_allocation_subset, attr, method, v) # create flow allocation ratios for each activity # if load_source_catalog()[k]['sector-like_activities'] flow_alloc_list = [] group_cols = fba_mapped_default_grouping_fields group_cols = [ e for e in group_cols if e not in ('ActivityProducedBy', 'ActivityConsumedBy') ] for n in names: log.info("Creating allocation ratios for " + n) fba_allocation_subset_2 = get_fba_allocation_subset( fba_allocation_subset, k, [n], flowSubsetMapped=flow_subset_mapped, allocMethod=attr['allocation_method']) if len(fba_allocation_subset_2) == 0: log.info("No data found to allocate " + n) else: flow_alloc = allocate_by_sector( fba_allocation_subset_2, k, attr['allocation_source'], attr['allocation_method'], group_cols, flowSubsetMapped=flow_subset_mapped) flow_alloc = flow_alloc.assign(FBA_Activity=n) flow_alloc_list.append(flow_alloc) flow_allocation = pd.concat(flow_alloc_list, ignore_index=True) # generalize activity field names to enable link to main fba source log.info("Generalizing activity columns in subset of " + attr['allocation_source']) flow_allocation = collapse_activity_fields(flow_allocation) # check for issues with allocation ratios check_allocation_ratios(flow_allocation, aset, k, method_name) # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \ # aren't in list sector_list = flow_allocation['Sector'].unique().tolist() # subset fba allocation table to the values in the activity list, based on overlapping sectors flow_subset_mapped = flow_subset_mapped.loc[ (flow_subset_mapped[fbs_activity_fields[0]]. isin(sector_list)) | (flow_subset_mapped[fbs_activity_fields[1]]. isin(sector_list))] # check if fba and allocation dfs have the same LocationSystem log.info( "Checking if flowbyactivity and allocation dataframes use the same location systems" ) check_if_location_systems_match(flow_subset_mapped, flow_allocation) # merge fba df w/flow allocation dataset log.info("Merge " + k + " and subset of " + attr['allocation_source']) fbs = flow_subset_mapped.merge( flow_allocation[[ 'Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity' ]], left_on=[ 'Location', 'SectorProducedBy', 'ActivityProducedBy' ], right_on=['Location', 'Sector', 'FBA_Activity'], how='left') fbs = fbs.merge( flow_allocation[[ 'Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity' ]], left_on=[ 'Location', 'SectorConsumedBy', 'ActivityConsumedBy' ], right_on=['Location', 'Sector', 'FBA_Activity'], how='left') # merge the flowamount columns fbs.loc[:, 'FlowAmountRatio'] = fbs[ 'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y']) # fill null rows with 0 because no allocation info fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0) # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo' # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations") # check_if_data_exists_for_same_geoscales(fbs, k, attr['names']) # drop rows where there is no allocation data fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index() # calculate flow amounts for each sector log.info("Calculating new flow amounts using flow ratios") fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[ 'FlowAmountRatio'] # drop columns log.info("Cleaning up new flow by sector") fbs = fbs.drop(columns=[ 'Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y', 'FlowAmountRatio', 'FBA_Activity_x', 'FBA_Activity_y' ]) # drop rows where flowamount = 0 (although this includes dropping suppressed data) fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True) # define grouping columns dependent on sectors being activity-like or not if load_source_catalog()[k]['sector-like_activities'] is False: groupingcols = fbs_grouping_fields_w_activities groupingdict = flow_by_sector_fields_w_activity else: groupingcols = fbs_default_grouping_fields groupingdict = flow_by_sector_fields # clean df fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict) # aggregate df geographically, if necessary # todo: replace with fxn return_from_scale log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level") if fips_number_key[v['geoscale_to_use']] < fips_number_key[ attr['allocation_from_scale']]: from_scale = v['geoscale_to_use'] else: from_scale = attr['allocation_from_scale'] to_scale = method['target_geoscale'] fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale, groupingcols) # aggregate data to every sector level log.info("Aggregating flowbysector to all sector levels") fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols) # add missing naics5/6 when only one naics5/6 associated with a naics4 fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict) # check if any sector information is lost before reaching the target sector length, if so, # allocate values equally to disaggregated sectors log.info('Checking for data at ' + method['target_sector_level']) fbs_agg_2 = check_if_losing_sector_data( fbs_agg, method['target_sector_level']) # compare flowbysector with flowbyactivity # todo: modify fxn to work if activities are sector like in df being allocated if load_source_catalog()[k]['sector-like_activities'] is False: check_for_differences_between_fba_load_and_fbs_output( flow_subset_mapped, fbs_agg_2, aset, k, method_name) # return sector level specified in method yaml # load the crosswalk linking sector lengths sector_list = get_sector_list(method['target_sector_level']) # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows fbs_1 = fbs_agg_2.loc[ (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list) )].reset_index(drop=True) fbs_2 = fbs_agg_2.loc[ (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) & (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index( drop=True) fbs_3 = fbs_agg_2.loc[ (fbs_agg_2[fbs_activity_fields[0]].isnull()) & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list) )].reset_index(drop=True) fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3]) # drop activity columns fbs_sector_subset = fbs_sector_subset.drop( ['ActivityProducedBy', 'ActivityConsumedBy'], axis=1, errors='ignore') # save comparison of FBA total to FBS total for an activity set compare_fba_load_and_fbs_output_totals(flows_subset_geo, fbs_sector_subset, aset, k, method_name, attr, method, mapping_files) log.info( "Completed flowbysector for activity subset with flows " + ', '.join(map(str, names))) fbs_list.append(fbs_sector_subset) else: # if the loaded flow dt is already in FBS format, append directly to list of FBS log.info("Append " + k + " to FBS list") # ensure correct field datatypes and add any missing fields flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict) fbs_list.append(flows) # create single df of all activities log.info("Concat data for all activities") fbss = pd.concat(fbs_list, ignore_index=True, sort=False) log.info("Clean final dataframe") # aggregate df as activities might have data for the same specified sector length fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict) fbss = aggregator(fbss, fbs_default_grouping_fields) # sort df log.info("Sort and store dataframe") # add missing fields, ensure correct data type, reorder columns fbss = fbss.sort_values( ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True) # save parquet file store_flowbysector(fbss, method_name)
def disaggregate_cropland(fba_w_sector, attr): """ In the event there are 4 (or 5) digit naics for cropland at the county level, use state level harvested cropland to create ratios :param fba_w_sector: :param attr: :return: """ import flowsa from flowsa.flowbyfunctions import generalize_activity_field_names, sector_aggregation,\ fbs_default_grouping_fields, clean_df, fba_fill_na_dict, add_missing_flow_by_fields from flowsa.mapping import add_sectors_to_flowbyactivity # drop pastureland data crop = fba_w_sector.loc[fba_w_sector['Sector'].apply(lambda x: str(x[0:3])) != '112'].reset_index(drop=True) # drop sectors < 4 digits crop = crop[crop['Sector'].apply(lambda x: len(x) > 3)].reset_index( drop=True) # create tmp location crop.loc[:, 'Location_tmp'] = crop['Location'].apply(lambda x: str(x[0:2])) # load the relevant state level harvested cropland by naics naics_load = flowsa.getFlowByActivity( flowclass=['Land'], years=[attr['allocation_source_year']], datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True) # clean df naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict) # subset the harvested cropland by naics naics = naics[naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True) # add sectors naics = add_sectors_to_flowbyactivity(naics, sectorsourcename='NAICS_2012_Code', levelofSectoragg='agg') # add missing fbs fields naics = add_missing_flow_by_fields(naics, flow_by_sector_fields) # aggregate sectors to create any missing naics levels naics = sector_aggregation(naics, fbs_default_grouping_fields) # add missing naics5/6 when only one naics5/6 associated with a naics4 naics = sector_disaggregation(naics) # drop rows where sector consumed by is none and FlowAmount 0 naics = naics[naics['SectorConsumedBy'].notnull()] naics = naics.loc[naics['FlowAmount'] != 0] # create ratios naics = sector_ratios(naics) # drop sectors < 4 digits #naics = naics[naics['SectorConsumedBy'].apply(lambda x: len(x) > 3)].reset_index(drop=True) # create temporary sector column to match the two dfs on naics.loc[:, 'Location_tmp'] = naics['Location'].apply(lambda x: str(x[0:2])) # for loop through naics lengths to determine naics 4 and 5 digits to disaggregate for i in range(4, 6): # subset df to sectors with length = i and length = i + 1 crop_subset = crop.loc[crop['Sector'].apply( lambda x: i + 1 >= len(x) >= i)] crop_subset.loc[:, 'Sector_tmp'] = crop_subset['Sector'].apply( lambda x: x[0:i]) # if duplicates drop all rows df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'], keep=False).reset_index(drop=True) # drop sector temp column df = df.drop(columns=["Sector_tmp"]) # subset df to keep the sectors of length i df_subset = df.loc[df['Sector'].apply(lambda x: len(x) == i)] # subset the naics df where naics length is i + 1 naics_subset = naics.loc[naics['SectorConsumedBy'].apply( lambda x: len(x) == i + 1)].reset_index(drop=True) naics_subset.loc[:, 'Sector_tmp'] = naics_subset[ 'SectorConsumedBy'].apply(lambda x: x[0:i]) # merge the two df based on locations df_subset = pd.merge(df_subset, naics_subset[[ 'SectorConsumedBy', 'FlowAmountRatio', 'Sector_tmp', 'Location_tmp' ]], how='left', left_on=['Sector', 'Location_tmp'], right_on=['Sector_tmp', 'Location_tmp']) # create flow amounts for the new NAICS based on the flow ratio df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[ 'FlowAmountRatio'] # drop rows of 0 and na df_subset = df_subset[df_subset['FlowAmount'] != 0] df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index( drop=True) # drop columns df_subset = df_subset.drop( columns=['Sector', 'FlowAmountRatio', 'Sector_tmp']) # rename columns df_subset = df_subset.rename(columns={"SectorConsumedBy": "Sector"}) # add new rows of data to crop df crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True) # clean up df crop = crop.drop(columns=['Location_tmp']) # pasture data pasture = fba_w_sector.loc[fba_w_sector['Sector'].apply( lambda x: str(x[0:3])) == '112'].reset_index(drop=True) # concat crop and pasture fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True) return fba_w_sector
def geoscale_flow_comparison(flowclass, years, datasource, activitynames=['all'], to_scale='national'): """ Aggregates county data to state and national, and state data to national level, allowing for comparisons in flow totals for a given flowclass and industry. First assigns all flownames to NAICS and standardizes units. Assigned to NAICS rather than using FlowNames for aggregation to negate any changes in flownames across time/geoscale """ # load parquet file checking aggregation flows = flowsa.getFlowByActivity(flowclass=flowclass, years=years, datasource=datasource) # fill null values flows = flows.fillna(value=fba_fill_na_dict) # convert units flows = convert_unit(flows) # if activityname set to default, then compare aggregation for all activities. If looking at particular activity, # filter that activity out if activitynames == ['all']: flow_subset = flows.copy() else: flow_subset = flows[ (flows[fba_activity_fields[0]].isin(activitynames)) | (flows[fba_activity_fields[1]].isin(activitynames))] # Reset index values after subset flow_subset = flow_subset.reset_index() # pull naics crosswalk mapping = get_activitytosector_mapping(flow_subset['SourceName'].all()) # assign naics to activities # usgs datasource is not easily assigned to naics for checking totals, so instead standardize activity names if datasource == 'USGS_NWIS_WU': flow_subset = standardize_usgs_nwis_names(flow_subset) else: flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']], left_on='ActivityProducedBy', right_on='Activity', how='left').rename( {'Sector': 'SectorProducedBy'}, axis=1) flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']], left_on='ActivityConsumedBy', right_on='Activity', how='left').rename( {'Sector': 'SectorConsumedBy'}, axis=1) flow_subset = flow_subset.drop(columns=[ 'ActivityProducedBy', 'ActivityConsumedBy', 'Activity_x', 'Activity_y', 'Description' ], errors='ignore') flow_subset['SectorProducedBy'] = flow_subset['SectorProducedBy'].replace({ np.nan: None }).astype(str) flow_subset['SectorConsumedBy'] = flow_subset['SectorConsumedBy'].replace({ np.nan: None }).astype(str) # create list of geoscales for aggregation if to_scale == 'national': geoscales = ['national', 'state', 'county'] elif to_scale == 'state': geoscales = ['state', 'county'] # create empty df list flow_dfs = [] for i in geoscales: try: # filter by geoscale fba_from_scale = filter_by_geoscale(flow_subset, i) # remove/add column names as a column group_cols = fba_default_grouping_fields.copy() for j in ['Location', 'ActivityProducedBy', 'ActivityConsumedBy']: group_cols.remove(j) for j in ['SectorProducedBy', 'SectorConsumedBy']: group_cols.append(j) # county sums to state and national, state sums to national if to_scale == 'state': fba_from_scale['Location'] = fba_from_scale['Location'].apply( lambda x: str(x[0:2])) elif to_scale == 'national': fba_from_scale['Location'] = US_FIPS # aggregate fba_agg = aggregator(fba_from_scale, group_cols) # rename flowamount column, based on geoscale fba_agg = fba_agg.rename(columns={"FlowAmount": "FlowAmount_" + i}) # drop fields irrelevant to aggregated flow comparision drop_fields = flows[[ 'MeasureofSpread', 'Spread', 'DistributionType', 'DataReliability', 'DataCollection' ]] fba_agg = fba_agg.drop(columns=drop_fields) # reset index fba_agg = fba_agg.reset_index(drop=True) flow_dfs.append(fba_agg) except: pass # merge list of dfs by column flow_comparison = reduce( lambda left, right: pd.merge( left, right, on=[ 'Class', 'SourceName', 'FlowName', 'Unit', 'SectorProducedBy', 'SectorConsumedBy', 'Compartment', 'Location', 'LocationSystem', 'Year' ], how='outer'), flow_dfs) # sort df flow_comparison = flow_comparison.sort_values([ 'Year', 'Location', 'SectorProducedBy', 'SectorConsumedBy', 'FlowName', 'Compartment' ]) return flow_comparison
def convert_statcan_data_to_US_water_use(df, attr): """ Use Canadian GDP data to convert 3 digit canadian water use to us water use: - canadian gdp - us gdp :return: """ import flowsa from flowsa.values_from_literature import get_Canadian_to_USD_exchange_rate from flowsa.flowbyfunctions import assign_fips_location_system, aggregator, fba_default_grouping_fields from flowsa.common import US_FIPS, load_bea_crosswalk # load Canadian GDP data gdp = flowsa.getFlowByActivity(flowclass=['Money'], datasource='StatCan_GDP', years=[attr['allocation_source_year']]) # drop 31-33 gdp = gdp[gdp['ActivityProducedBy'] != '31-33'] gdp = gdp.rename(columns={"FlowAmount": "CanDollar"}) # merge df df_m = pd.merge(df, gdp[['CanDollar', 'ActivityProducedBy']], how='left', left_on='ActivityConsumedBy', right_on='ActivityProducedBy') df_m['CanDollar'] = df_m['CanDollar'].fillna(0) df_m = df_m.drop(columns=["ActivityProducedBy_y"]) df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"}) df_m = df_m[df_m['CanDollar'] != 0] exchange_rate = get_Canadian_to_USD_exchange_rate( str(attr['allocation_source_year'])) exchange_rate = float(exchange_rate) # convert to mgal/USD df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] / exchange_rate) df_m.loc[:, 'Unit'] = 'Mgal/USD' df_m = df_m.drop(columns=["CanDollar"]) # convert Location to US df_m.loc[:, 'Location'] = US_FIPS df_m = assign_fips_location_system(df_m, str(attr['allocation_source_year'])) # load us gdp # load Canadian GDP data us_gdp_load = flowsa.getFlowByActivity( flowclass=['Money'], datasource='BEA_GDP_GrossOutput_IO', years=[attr['allocation_source_year']]) # load bea crosswalk cw_load = load_bea_crosswalk() cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates() cw = cw[cw['NAICS_2012_Code'].apply( lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True) # merge us_gdp = pd.merge(us_gdp_load, cw, how='left', left_on='ActivityProducedBy', right_on='BEA_2012_Detail_Code') us_gdp = us_gdp.drop( columns=['ActivityProducedBy', 'BEA_2012_Detail_Code']) # rename columns us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'}) # agg by naics us_gdp = aggregator(us_gdp, fba_default_grouping_fields) us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'}) # determine annual us water use df_m2 = pd.merge(df_m, us_gdp[['ActivityProducedBy', 'us_gdp']], how='left', left_on='ActivityConsumedBy', right_on='ActivityProducedBy') df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp']) df_m2.loc[:, 'Unit'] = 'Mgal' df_m2 = df_m2.rename( columns={'ActivityProducedBy_x': 'ActivityProducedBy'}) df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp']) return df_m2
def allocation_helper(df_w_sector, method, attr): """ Used when two df required to create allocation ratio :param df_w_sector: :param method: currently written for 'multiplication' :param attr: :return: """ from flowsa.mapping import add_sectors_to_flowbyactivity helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']], datasource=attr['helper_source'], years=[attr['helper_source_year']]) # clean df helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict) # drop rows with flowamount = 0 helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0] # assign naics to allocation dataset helper_allocation = add_sectors_to_flowbyactivity(helper_allocation, sectorsourcename=method[ 'target_sector_source'], levelofSectoragg=attr[ 'helper_sector_aggregation']) # generalize activity field names to enable link to water withdrawal table helper_allocation = generalize_activity_field_names(helper_allocation) # drop columns helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max']) # rename column helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'}) # merge allocation df with helper df based on sectors, depending on geo scales of dfs if attr['helper_from_scale'] == 'national': modified_fba_allocation = df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']], how='left') if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'state'): modified_fba_allocation = df_w_sector.merge( helper_allocation[['Sector', 'Location', 'HelperFlow']], how='left') if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'): helper_allocation.loc[:, 'Location_tmp'] = helper_allocation['Location'].apply( lambda x: str(x[0:2])) df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(lambda x: str(x[0:2])) modified_fba_allocation = df_w_sector.merge( helper_allocation[['Sector', 'Location_tmp', 'HelperFlow']], how='left') modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp']) # todo: modify so if missing data, replaced with value from one geoscale up instead of national # if missing values (na or 0), replace with national level values replacement_values = helper_allocation[helper_allocation['Location'] == US_FIPS].reset_index( drop=True) replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'}) modified_fba_allocation = modified_fba_allocation.merge( replacement_values[['Sector', 'ReplacementValue']], how='left') modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna( modified_fba_allocation['ReplacementValue']) modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(modified_fba_allocation['HelperFlow'] == 0, modified_fba_allocation['ReplacementValue'], modified_fba_allocation['HelperFlow']) # modify flow amounts using helper data if attr['helper_method'] == 'multiplication': # replace non-existent helper flow values with a 0, so after multiplying, don't have incorrect value associated # with new unit modified_fba_allocation['HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna( value=0) modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \ modified_fba_allocation[ 'HelperFlow'] # drop columns modified_fba_allocation = modified_fba_allocation.drop( columns=["HelperFlow", 'ReplacementValue']) # drop rows of 0 to speed up allocation modified_fba_allocation = modified_fba_allocation[ modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True) #todo: modify the unit return modified_fba_allocation
available parameters for getFlowByActivity(). Examples of use of flowsa. Read parquet files as dataframes. :param datasource: str, the code of the datasource. :param year: int, a year, e.g. 2012 :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water' :param geographic_level: str, a geographic level of the data. Optional. E.g. 'national', 'state', 'county'. :return: a pandas DataFrame in FlowByActivity format """ import flowsa from flowsa.settings import fbaoutputpath # see all datasources and years available in flowsa flowsa.seeAvailableFlowByModels('FBA') # Load all information for EIA MECS Land fba_mecs = flowsa.getFlowByActivity(datasource="EIA_MECS_Land", year=2014) # only load state level water data and save as csv fba_usgs = flowsa.getFlowByActivity( datasource="USGS_NWIS_WU", year=2015, flowclass='Water', geographic_level='state').reset_index(drop=True) # save output to csv, maintain leading 0s in location col fba_usgs.Location = fba_usgs.Location.apply('="{}"'.format) fba_usgs.to_csv(f"{fbaoutputpath}USGS_NWIS_WU_2015.csv", index=False)
def main(method_name): """ Creates a flowbysector dataset :param method_name: Name of method corresponding to flowbysector method yaml name :return: flowbysector """ log.info("Initiating flowbysector creation for " + method_name) # call on method method = load_method(method_name) # create dictionary of water data and allocation datasets fbas = method['flowbyactivity_sources'] # Create empty list for storing fbs files fbss = [] for k, v in fbas.items(): # pull water data for allocation log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year'])) flows = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']], datasource=k) # if necessary, standardize names in data set if v['activity_name_standardization_fxn'] != 'None': log.info("Standardizing activity names in " + k) flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows) # drop description field flows = flows.drop(columns='Description') # fill null values flows = flows.fillna(value=fba_fill_na_dict) # map df to elementary flows - commented out until mapping complete # log.info("Mapping flows in " + k + ' to federal elementary flow list') # flows_mapped = map_elementary_flows(flows, k) # convert unit todo: think about unit conversion here log.info("Converting units in " + k) flows = convert_unit(flows) # create dictionary of allocation datasets for different activities activities = v['activity_sets'] for aset, attr in activities.items(): # subset by named activities names = [attr['names']] log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k) # subset usgs data by activity flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) | (flows[fba_activity_fields[1]].isin(names))] # Reset index values after subset flow_subset = flow_subset.reset_index(drop=True) # check if flowbyactivity data exists at specified geoscale to use log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " + v['geoscale_to_use'] + ' level') geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use']) # aggregate geographically to the scale of the allocation dataset if geocheck == "Yes": activity_from_scale = v['geoscale_to_use'] else: # if activity does not exist at specified geoscale, issue warning and use data at less aggregated # geoscale, and sum to specified geoscale log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level") new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names, v['geoscale_to_use']) activity_from_scale = new_geoscale_to_use activity_to_scale = attr['allocation_from_scale'] # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]: log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale) flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names) # else, aggregate to geoscale want to use elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]: log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use']) flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names) # else, if usgs is more aggregated than allocation table, filter relevant rows else: log.info("Filtering out " + activity_from_scale + " data") flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names) # location column pad zeros if necessary flow_subset['Location'] = flow_subset['Location'].apply(lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x ) # Add sectors to usgs activity, creating two versions of the flow subset # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS) # the second version, "flow_subset_agg" includes only the most aggregated level of sectors log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names))) flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset, sectorsourcename=method['target_sector_source']) flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset, sectorsourcename=method['target_sector_source'], levelofSectoragg='agg') # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation # dataframe to create sector allocation ratios if attr['allocation_method'] == 'direct': fbs = flow_subset_wsec_agg.copy() else: # determine appropriate allocation dataset log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year'])) fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']], datasource=attr['allocation_source'], years=[attr['allocation_source_year']]).reset_index(drop=True) # fill null values fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict) # convert unit fba_allocation = convert_unit(fba_allocation) # subset based on yaml settings if attr['allocation_flow'] != 'None': fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])] if attr['allocation_compartment'] != 'None': fba_allocation = fba_allocation.loc[ fba_allocation['Compartment'].isin(attr['allocation_compartment'])] # reset index fba_allocation = fba_allocation.reset_index(drop=True) # check if allocation data exists at specified geoscale to use log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) + " at the " + attr['allocation_from_scale'] + " level") check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale']) # aggregate geographically to the scale of the flowbyactivty source, if necessary from_scale = attr['allocation_from_scale'] to_scale = v['geoscale_to_use'] # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale if fips_number_key[from_scale] > fips_number_key[to_scale]: fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names) # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale else: fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names) # assign sector to allocation dataset log.info("Adding sectors to " + attr['allocation_source']) fba_allocation = add_sectors_to_flowbyactivity(fba_allocation, sectorsourcename=method['target_sector_source'], levelofSectoragg=attr[ 'allocation_sector_aggregation']) # subset fba datsets to only keep the naics associated with usgs activity subset log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k) fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names) # Reset index values after subset fba_allocation_subset = fba_allocation_subset.reset_index(drop=True) # generalize activity field names to enable link to water withdrawal table log.info("Generalizing activity names in subset of " + attr['allocation_source']) fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset) # drop columns fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity']) # if there is an allocation helper dataset, modify allocation df if attr['allocation_helper'] == 'yes': log.info("Using the specified allocation help for subset of " + attr['allocation_source']) fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr) # create flow allocation ratios log.info("Creating allocation ratios for " + attr['allocation_source']) flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method']) # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \ # aren't in list sector_list = flow_allocation['Sector'].unique().tolist() # subset fba allocation table to the values in the activity list, based on overlapping sectors flow_subset_wsec = flow_subset_wsec.loc[ (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) | (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))] # check if fba and allocation dfs have the same LocationSystem log.info("Checking if flowbyactivity and allocation dataframes use the same location systems") check_if_location_systems_match(flow_subset_wsec, flow_allocation) # merge water withdrawal df w/flow allocation dataset log.info("Merge " + k + " and subset of " + attr['allocation_source']) fbs = flow_subset_wsec.merge( flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']], left_on=['Location', 'LocationSystem', 'SectorProducedBy'], right_on=['Location', 'LocationSystem', 'Sector'], how='left') fbs = fbs.merge( flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']], left_on=['Location', 'LocationSystem', 'SectorConsumedBy'], right_on=['Location', 'LocationSystem', 'Sector'], how='left') # drop columns where both sector produced/consumed by in flow allocation dif is null fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index() # merge the flowamount columns fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y']) fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0) # calculate flow amounts for each sector log.info("Calculating new flow amounts using flow ratios") fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio'] # drop columns log.info("Cleaning up new flow by sector") fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y', 'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy']) # rename flow name to flowable fbs = fbs.rename(columns={"FlowName": 'Flowable', "Compartment": "Context" }) # drop rows where flowamount = 0 (although this includes dropping suppressed data) fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True) # add missing data columns fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields) # fill null values fbs = fbs.fillna(value=fbs_fill_na_dict) # aggregate df geographically, if necessary log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level") if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]: from_scale = v['geoscale_to_use'] else: from_scale = attr['allocation_from_scale'] to_scale = method['target_geoscale'] fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names) # aggregate data to every sector level log.info("Aggregating flowbysector to " + method['target_sector_level']) fbs = sector_aggregation(fbs, fbs_default_grouping_fields) # test agg by sector sector_agg_comparison = sector_flow_comparision(fbs) # return sector level specified in method yaml # load the crosswalk linking sector lengths cw = load_sector_length_crosswalk() sector_list = cw[method['target_sector_level']].unique().tolist() # add any non-NAICS sectors used with NAICS household = load_household_sector_codes() household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']] # add household sector to sector list sector_list.extend(household['Code'].tolist()) # subset df fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) | (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True) # add any missing columns of data and cast to appropriate data type fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields) log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names))) fbss.append(fbs) # create single df of all activities fbss = pd.concat(fbss, ignore_index=True, sort=False) # aggregate df as activities might have data for the same specified sector length fbss = aggregator(fbss, fbs_default_grouping_fields) # sort df fbss = fbss.sort_values( ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True) # save parquet file store_flowbysector(fbss, method_name)
def disaggregate_pastureland(fba_w_sector, attr, method, years_list, sector_column): """ The USDA CoA Cropland irrigated pastureland data only links to the 3 digit NAICS '112'. This function uses state level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS. :param fba_w_sector: The CoA Cropland dataframe after linked to sectors :param attr: :param years_list: :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy) :return: The CoA cropland dataframe with disaggregated pastureland data """ import flowsa from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \ fba_fill_na_dict, replace_strings_with_NoneType, replace_NoneType_with_empty_cells, \ fba_mapped_default_grouping_fields, harmonize_units from flowsa.mapping import add_sectors_to_flowbyactivity # tmp drop NoneTypes fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector) # subset the coa data so only pastureland p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) == '112'].reset_index(drop=True) if len(p) != 0: # add temp loc column for state fips p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2])) df_sourcename = pd.unique(p['SourceName'])[0] # load usda coa cropland naics df_class = ['Land'] df_years = years_list df_allocation = 'USDA_CoA_Cropland_NAICS' df_f = flowsa.getFlowByActivity(flowclass=df_class, years=df_years, datasource=df_allocation) df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict) df_f = harmonize_units(df_f) # subset to land in farms data df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS'] # subset to rows related to pastureland df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) == '112'] # drop rows with "&' df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')] # create sector columns df_f = add_sectors_to_flowbyactivity(df_f, sectorsourcename=method['target_sector_source']) # create proportional ratios group_cols = fba_mapped_default_grouping_fields group_cols = [e for e in group_cols if e not in ('ActivityProducedBy', 'ActivityConsumedBy')] df_f = allocate_by_sector(df_f, df_sourcename, df_allocation, 'proportional', group_cols) # tmp drop NoneTypes df_f = replace_NoneType_with_empty_cells(df_f) # drop naics = '11 df_f = df_f[df_f[sector_column] != '11'] # drop 000 in location df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2])) # merge the coa pastureland data with land in farm data df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']], how='left', left_on="Location_tmp", right_on="Location") # multiply the flowamount by the flowratio df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio'] # drop columns and rename df = df.drop(columns=['Location_tmp', sector_column + '_x', 'Location_y', 'FlowAmountRatio']) df = df.rename(columns={sector_column + '_y': sector_column, "Location_x": 'Location'}) # drop rows where sector = 112 and then concat with original fba_w_sector fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(lambda x: x[0:3]) != '112'].reset_index(drop=True) fba_w_sector = pd.concat([fba_w_sector, df], sort=True).reset_index(drop=True) # fill empty cells with NoneType fba_w_sector = replace_strings_with_NoneType(fba_w_sector) return fba_w_sector
def allocation_helper(df_w_sector, method, attr, v): """ Used when two df required to create allocation ratio :param df_w_sector: :param method: currently written for 'multiplication' and 'proportional' :param attr: :return: """ from flowsa.Blackhurst_IO import scale_blackhurst_results_to_usgs_values from flowsa.BLS_QCEW import clean_bls_qcew_fba, bls_clean_allocation_fba_w_sec from flowsa.mapping import add_sectors_to_flowbyactivity helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']], datasource=attr['helper_source'], years=[attr['helper_source_year']]) if 'clean_helper_fba' in attr: log.info("Cleaning " + attr['helper_source'] + ' FBA') # tmp hard coded - need to generalize if attr['helper_source'] == 'BLS_QCEW': helper_allocation = clean_bls_qcew_fba(helper_allocation, attr) # helper_allocation = getattr(sys.modules[__name__], attr["clean_helper_fba"])(helper_allocation, attr) # clean df helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict) # drop rows with flowamount = 0 helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0] # agg data if necessary or filter # determine to scale to_scale = min(fips_number_key[attr['allocation_from_scale']], fips_number_key[v['geoscale_to_use']]) if fips_number_key[attr['helper_from_scale']] > to_scale: helper_allocation = agg_by_geoscale(helper_allocation, attr['helper_from_scale'], list(fips_number_key.keys())[list(fips_number_key.values()).index(to_scale)], fba_default_grouping_fields) else: helper_allocation = filter_by_geoscale(helper_allocation, attr['helper_from_scale']) # assign naics to allocation dataset helper_allocation = add_sectors_to_flowbyactivity(helper_allocation, sectorsourcename=method['target_sector_source']) # generalize activity field names to enable link to water withdrawal table helper_allocation = generalize_activity_field_names(helper_allocation) # clean up helper fba with sec if 'clean_helper_fba_wsec' in attr: log.info("Cleaning " + attr['helper_source'] + ' FBA with sectors') # tmp hard coded - need to generalize if attr['helper_source'] == 'BLS_QCEW': helper_allocation = bls_clean_allocation_fba_w_sec(helper_allocation, attr, method) # helper_allocation = getattr(sys.modules[__name__], attr["clean_helper_fba_wsec"])(helper_allocation, attr, method) # drop columns helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max']) if attr['helper_method'] == 'proportional': # if calculating proportion, first subset the helper allocation df to only contain relevant sectors # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \ # aren't in list sector_list = df_w_sector['Sector'].unique().tolist() # subset fba allocation table to the values in the activity list, based on overlapping sectors helper_allocation = helper_allocation.loc[helper_allocation['Sector'].isin(sector_list)] # calculate proportional ratios helper_allocation = proportional_allocation_by_location_and_sector(helper_allocation, 'Sector') # rename column helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'}) merge_columns = [e for e in ['Location','Sector', 'HelperFlow', 'FlowAmountRatio'] if e in helper_allocation.columns.values.tolist()] # merge allocation df with helper df based on sectors, depending on geo scales of dfs if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'): helper_allocation.loc[:, 'Location_tmp'] = helper_allocation['Location'].apply(lambda x: x[0:2]) df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(lambda x: x[0:2]) merge_columns.append('Location_tmp') modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left') modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp']) else: modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left') # modify flow amounts using helper data if 'multiplication' in attr['helper_method']: # todo: modify so if missing data, replaced with value from one geoscale up instead of national # todo: modify year after merge if necessary # if missing values (na or 0), replace with national level values replacement_values = helper_allocation[helper_allocation['Location'] == US_FIPS].reset_index( drop=True) replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'}) modified_fba_allocation = modified_fba_allocation.merge( replacement_values[['Sector', 'ReplacementValue']], how='left') modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna( modified_fba_allocation['ReplacementValue']) modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(modified_fba_allocation['HelperFlow'] == 0, modified_fba_allocation['ReplacementValue'], modified_fba_allocation['HelperFlow']) # replace non-existent helper flow values with a 0, so after multiplying, don't have incorrect value associated # with new unit modified_fba_allocation['HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(value=0) modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \ modified_fba_allocation['HelperFlow'] # drop columns modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'ReplacementValue']) elif attr['helper_method'] == 'proportional': modified_fba_allocation['FlowAmountRatio'] = modified_fba_allocation['FlowAmountRatio'].fillna(0) modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \ modified_fba_allocation['FlowAmountRatio'] modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'FlowAmountRatio']) # drop rows of 0 modified_fba_allocation = modified_fba_allocation[modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True) # todo: change units modified_fba_allocation.loc[modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal' # option to scale up fba values if 'scaled' in attr['helper_method']: log.info("Scaling " + attr['helper_source'] + ' to FBA values') # tmp hard coded - need to generalize if attr['helper_source'] == 'BLS_QCEW': modified_fba_allocation = scale_blackhurst_results_to_usgs_values(modified_fba_allocation, attr) # modified_fba_allocation = getattr(sys.modules[__name__], attr["scale_helper_results"])(modified_fba_allocation, attr) return modified_fba_allocation
def main(method_name): """ Creates a flowbysector dataset :param method_name: Name of method corresponding to flowbysector method yaml name :return: flowbysector """ log.info("Initiating flowbysector creation for " + method_name) # call on method method = load_method(method_name) # create dictionary of data and allocation datasets fb = method['source_names'] # Create empty list for storing fbs files fbss = [] for k, v in fb.items(): # pull fba data for allocation flows = load_source_dataframe(k, v) if v['data_format'] == 'FBA': # clean up fba, if specified in yaml if v["clean_fba_df_fxn"] != 'None': log.info("Cleaning up " + k + " FlowByActivity") flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows) flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict) # create dictionary of allocation datasets for different activities activities = v['activity_sets'] # subset activity data and allocate to sector for aset, attr in activities.items(): # subset by named activities names = attr['names'] log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k) # check if flowbyactivity data exists at specified geoscale to use flow_subset_list = [] for n in names: # subset usgs data by activity flow_subset = flows[(flows[fba_activity_fields[0]] == n) | (flows[fba_activity_fields[1]] == n)].reset_index(drop=True) log.info("Checking if flowbyactivity data exists for " + n + " at the " + v['geoscale_to_use'] + ' level') geocheck = check_if_data_exists_at_geoscale(flow_subset, v['geoscale_to_use'], activitynames=n) # aggregate geographically to the scale of the allocation dataset if geocheck == "Yes": activity_from_scale = v['geoscale_to_use'] else: # if activity does not exist at specified geoscale, issue warning and use data at less aggregated # geoscale, and sum to specified geoscale log.info("Checking if flowbyactivity data exists for " + n + " at a less aggregated level") activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, v['geoscale_to_use'], n) activity_to_scale = attr['allocation_from_scale'] # if df is less aggregated than allocation df, aggregate usgs activity to allocation geoscale if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]: log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale) flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, n) # else, aggregate to geoscale want to use elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]: log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use']) flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, n) # else, if usgs is more aggregated than allocation table, filter relevant rows else: log.info("Subsetting " + activity_from_scale + " data") flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n) # Add sectors to df activity, depending on level of specified sector aggregation log.info("Adding sectors to " + k + " for " + n) flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset, sectorsourcename=method['target_sector_source'], levelofSectoragg=attr['activity_sector_aggregation']) flow_subset_list.append(flow_subset_wsec) flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True) # clean up fba with sectors, if specified in yaml if v["clean_fba_w_sec_df_fxn"] != 'None': log.info("Cleaning up " + k + " FlowByActivity with sectors") flow_subset_wsec = getattr(sys.modules[__name__], v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr) # map df to elementary flows - commented out until mapping complete log.info("Mapping flows in " + k + ' to federal elementary flow list') flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k) # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation # dataframe to create sector allocation ratios if attr['allocation_method'] == 'direct': log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors') fbs = flow_subset_wsec.copy() else: # determine appropriate allocation dataset log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year'])) fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']], datasource=attr['allocation_source'], years=[attr['allocation_source_year']]).reset_index(drop=True) fba_allocation = clean_df(fba_allocation, flow_by_activity_fields, fba_fill_na_dict) # subset based on yaml settings if attr['allocation_flow'] != 'None': fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])] if attr['allocation_compartment'] != 'None': fba_allocation = fba_allocation.loc[ fba_allocation['Compartment'].isin(attr['allocation_compartment'])] # cleanup the fba allocation df, if necessary if 'clean_allocation_fba' in attr: log.info("Cleaning " + attr['allocation_source']) fba_allocation = getattr(sys.modules[__name__], attr["clean_allocation_fba"])(fba_allocation) # reset index fba_allocation = fba_allocation.reset_index(drop=True) # check if allocation data exists at specified geoscale to use log.info("Checking if allocation data exists at the " + attr['allocation_from_scale'] + " level") check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale']) # aggregate geographically to the scale of the flowbyactivty source, if necessary from_scale = attr['allocation_from_scale'] to_scale = v['geoscale_to_use'] # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale if fips_number_key[from_scale] > fips_number_key[to_scale]: fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names) # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale else: fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names) # assign sector to allocation dataset log.info("Adding sectors to " + attr['allocation_source']) fba_allocation = add_sectors_to_flowbyactivity(fba_allocation, sectorsourcename=method['target_sector_source'], levelofSectoragg=attr['allocation_sector_aggregation']) # subset fba datsets to only keep the sectors associated with activity subset log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k) fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names) # generalize activity field names to enable link to main fba source log.info("Generalizing activity columns in subset of " + attr['allocation_source']) fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset) # drop columns fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity']) # call on fxn to further disaggregate the fba allocation data, if exists if 'allocation_disaggregation_fxn' in attr: log.info("Futher disaggregating sectors in " + attr['allocation_source']) fba_allocation_subset = getattr(sys.modules[__name__], attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr) # if there is an allocation helper dataset, modify allocation df if attr['allocation_helper'] == 'yes': log.info("Using the specified allocation help for subset of " + attr['allocation_source']) fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr) # create flow allocation ratios log.info("Creating allocation ratios for " + attr['allocation_source']) flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method']) # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \ # aren't in list sector_list = flow_allocation['Sector'].unique().tolist() # subset fba allocation table to the values in the activity list, based on overlapping sectors flow_subset_wsec = flow_subset_wsec.loc[ (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) | (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))] # check if fba and allocation dfs have the same LocationSystem log.info("Checking if flowbyactivity and allocation dataframes use the same location systems") check_if_location_systems_match(flow_subset_wsec, flow_allocation) # merge fba df w/flow allocation dataset log.info("Merge " + k + " and subset of " + attr['allocation_source']) fbs = flow_subset_wsec.merge( flow_allocation[['Location', 'Sector', 'FlowAmountRatio']], left_on=['Location', 'SectorProducedBy'], right_on=['Location', 'Sector'], how='left') fbs = fbs.merge( flow_allocation[['Location', 'Sector', 'FlowAmountRatio']], left_on=['Location', 'SectorConsumedBy'], right_on=['Location', 'Sector'], how='left') # merge the flowamount columns fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y']) # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo' # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations") # check_if_data_exists_for_same_geoscales(fbs, k, attr['names']) # drop rows where there is no allocation data fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index() # calculate flow amounts for each sector log.info("Calculating new flow amounts using flow ratios") fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio'] # drop columns log.info("Cleaning up new flow by sector") fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y', 'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy']) # drop rows where flowamount = 0 (although this includes dropping suppressed data) fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True) # clean df fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict) # aggregate df geographically, if necessary log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level") if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]: from_scale = v['geoscale_to_use'] else: from_scale = attr['allocation_from_scale'] to_scale = method['target_geoscale'] fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names) # aggregate data to every sector level log.info("Aggregating flowbysector to all sector levels") fbs = sector_aggregation(fbs, fbs_default_grouping_fields) # add missing naics5/6 when only one naics5/6 associated with a naics4 fbs = sector_disaggregation(fbs) # test agg by sector # sector_agg_comparison = sector_flow_comparision(fbs) # return sector level specified in method yaml # load the crosswalk linking sector lengths sector_list = get_sector_list(method['target_sector_level']) # add any non-NAICS sectors used with NAICS sector_list = add_non_naics_sectors(sector_list, method['target_sector_level']) # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows fbs_1 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) & (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True) fbs_2 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) | (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True) fbs_sector_subset = pd.concat([fbs_1, fbs_2], sort=False) # set source name fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source'] log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names))) fbss.append(fbs_sector_subset) else: # if the loaded flow dt is already in FBS format, append directly to list of FBS log.info("Append " + k + " to FBS list") fbss.append(flows) # create single df of all activities log.info("Concat data for all activities") fbss = pd.concat(fbss, ignore_index=True, sort=False) log.info("Clean final dataframe") # aggregate df as activities might have data for the same specified sector length fbss = aggregator(fbss, fbs_default_grouping_fields) # sort df log.info("Sort and store dataframe") fbss = fbss.replace({'nan': None}) # add missing fields, ensure correct data type, reorder columns fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict) fbss = fbss.sort_values( ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True) # save parquet file store_flowbysector(fbss, method_name)
def disaggregate_cropland(fba_w_sector, attr, method, years_list, sector_column): """ In the event there are 4 (or 5) digit naics for cropland at the county level, use state level harvested cropland to create ratios :param fba_w_sector: :param attr: :param years_list: :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy) :param attr: :return: """ import flowsa from flowsa.flowbyfunctions import sector_aggregation,\ fbs_default_grouping_fields, clean_df, fba_fill_na_dict, fbs_fill_na_dict, add_missing_flow_by_fields,\ sector_disaggregation, sector_ratios, replace_strings_with_NoneType, replace_NoneType_with_empty_cells,\ harmonize_units from flowsa.mapping import add_sectors_to_flowbyactivity # tmp drop NoneTypes fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector) # drop pastureland data crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) != '112'].reset_index(drop=True) # drop sectors < 4 digits crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(drop=True) # create tmp location crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))\ # load the relevant state level harvested cropland by naics naics_load = flowsa.getFlowByActivity(flowclass=['Land'], years=years_list, datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True) # clean df naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict) naics = harmonize_units(naics) # subset the harvested cropland by naics naics = naics[naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True) # drop the activities that include '&' naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(drop=True) # add sectors naics = add_sectors_to_flowbyactivity(naics, sectorsourcename=method['target_sector_source']) # add missing fbs fields naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict) # drop cols and rename # naics = naics.drop(columns=["SectorProducedBy"]) # naics = naics.rename(columns={"SectorConsumedBy": sector_column}) # aggregate sectors to create any missing naics levels group_cols = fbs_default_grouping_fields # group_cols = [e for e in group_cols if e not in ('SectorProducedBy', 'SectorConsumedBy')] # group_cols.append(sector_column) naics2 = sector_aggregation(naics, group_cols) # add missing naics5/6 when only one naics5/6 associated with a naics4 naics3 = sector_disaggregation(naics2, group_cols) # drop rows where FlowAmount 0 # naics3 = naics3[~((naics3['SectorProducedBy'] == '') & (naics3['SectorConsumedBy'] == ''))] naics3 = naics3.loc[naics3['FlowAmount'] != 0] # create ratios naics4 = sector_ratios(naics3, sector_column) # create temporary sector column to match the two dfs on naics4 = naics4.assign(Location_tmp=naics4['Location'].apply(lambda x: x[0:2])) # tmp drop Nonetypes naics4 = replace_NoneType_with_empty_cells(naics4) # for loop through naics lengths to determine naics 4 and 5 digits to disaggregate for i in range(4, 6): # subset df to sectors with length = i and length = i + 1 crop_subset = crop.loc[crop[sector_column].apply(lambda x: i+1 >= len(x) >= i)] crop_subset = crop_subset.assign(Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i])) # if duplicates drop all rows df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'], keep=False).reset_index(drop=True) # drop sector temp column df = df.drop(columns=["Sector_tmp"]) # subset df to keep the sectors of length i df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)] # subset the naics df where naics length is i + 1 naics_subset = naics4.loc[naics4[sector_column].apply(lambda x: len(x) == i+1)].reset_index(drop=True) naics_subset = naics_subset.assign(Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i])) # merge the two df based on locations df_subset = pd.merge(df_subset, naics_subset[[sector_column, 'FlowAmountRatio', 'Sector_tmp', 'Location_tmp']], how='left', left_on=[sector_column, 'Location_tmp'], right_on=['Sector_tmp', 'Location_tmp']) # create flow amounts for the new NAICS based on the flow ratio df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset['FlowAmountRatio'] # drop rows of 0 and na df_subset = df_subset[df_subset['FlowAmount'] != 0] df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(drop=True) # drop columns df_subset = df_subset.drop(columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp']) # rename columns df_subset = df_subset.rename(columns={sector_column + '_y': sector_column}) # tmp drop Nonetypes df_subset = replace_NoneType_with_empty_cells(df_subset) # add new rows of data to crop df crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True) # clean up df crop = crop.drop(columns=['Location_tmp']) # pasture data pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) == '112'].reset_index(drop=True) # concat crop and pasture fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True) # fill empty cells with NoneType fba_w_sector = replace_strings_with_NoneType(fba_w_sector) return fba_w_sector
""" Write the csv called on in flowbysectormethods yaml files for land use related EIA CBECS """ import flowsa from flowsa.common import flowbysectoractivitysetspath as_year = '2012' if __name__ == '__main__': # define mecs land fba parameters datasource = 'EIA_CBECS_Land' # Read BLM PLS crosswalk df_import = flowsa.getFlowByActivity(datasource, as_year) # drop unused columns df = df_import[['ActivityConsumedBy']].drop_duplicates() # drop 'all buildings' to avoid double counting df = df[~df['ActivityConsumedBy']. isin(['All buildings', 'Mercantile', 'Health care'])] # rename columns df = df.rename(columns={"ActivityConsumedBy": "name"}) # assign column values df = df.assign(activity_set='activity_set_1') df = df.assign(note='') # reorder dataframe
# __init__.py (flowsa) # !/usr/bin/env python3 # coding=utf-8 # [email protected] """ Examples of use of flowsa. Read parquet files as dataframes. :param flowclass: list, a list of`Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other'] :param year: list, a list of years [2015], or [2010,2011,2012] :param datasource: str, the code of the datasource. :return: a pandas DataFrame in FlowByActivity format """ import flowsa # "employment" based datasets employ_bls_flowsbyactivity_2012 = flowsa.getFlowByActivity(flowclass=['Employment'], years=[2012], datasource="BLS_QCEW") employ_bls_flowsbyactivity_2015 = flowsa.getFlowByActivity(flowclass=['Employment', 'Money'], years=[2015], datasource="BLS_QCEW") employ_cpb_flowsbyactivity_2012 = flowsa.getFlowByActivity(flowclass=['Employment', 'Other'], years=[2012], datasource="Census_CBP") # "land" based datasets cropland_flowsbyactivity_2017 = flowsa.getFlowByActivity(flowclass=['Land'], years=[2017], datasource="USDA_CoA_Cropland") # "money" based datasets fisheries_noaa_flowsbyactivity = flowsa.getFlowByActivity(flowclass=['Money'], years=["2012-2018"], datasource="NOAA_FisheryLandings")