def getFlowByActivity(flowclass, years, datasource, geographic_level='all', file_location='local'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010,2011,2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: 'all', 'national', 'state', 'county'. Default is 'all'
    :param file_location: 'local' or 'remote'. Default is 'local'
    :return: a pandas DataFrame in FlowByActivity format
    """
    # collect one frame per year, then concat once at the end: concatenating
    # inside the loop rebuilds the accumulated frame every iteration (quadratic)
    fba_frames = []
    for y in years:
        # definitions
        fba_file = datasource + "_" + str(y) + ".parquet"
        local_file_path = fbaoutputpath + fba_file
        remote_file_path = 'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' + fba_file
        # load data
        if file_location == 'local':
            # load_file falls back to the remote path if the local file is missing
            fba = load_file(fba_file, local_file_path, remote_file_path)
        else:
            log.info('Loading ' + datasource + ' from remote server')
            fba = pd.read_parquet(remote_file_path)
        # subset to the requested flow classes
        fba = fba[fba['Class'].isin(flowclass)]
        # if geographic level specified, only load rows in geo level
        if geographic_level != 'all':
            fba = filter_by_geoscale(fba, geographic_level)
        fba_frames.append(fba)
    # concat dfs; preserve the original empty-DataFrame result when years is empty
    fbas = pd.concat(fba_frames, sort=False) if fba_frames else pd.DataFrame()
    return fbas
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Convert a Blackhurst water FlowByActivity (with sectors) from total flow
    amounts to gallons per employee, using 2002 BLS QCEW employment counts
    as the denominator.

    :param df_wsec: DataFrame, flow data with a 'Sector' column to be converted
    :param attr: dictionary, attributes for the activity set (passed to the
        BLS QCEW cleaning function)
    :param method: dictionary, FBS method yaml; 'target_sector_source' is used
        when assigning sectors to the employment data
    :return: DataFrame with FlowAmount converted to gal/employee
    """
    # function-local imports avoid a circular import with the flowsa package
    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict, agg_by_geoscale, fba_default_grouping_fields, \
        sector_ratios, proportional_allocation_by_location_and_sector, filter_by_geoscale
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    # load 2002 employment data (same year as the Blackhurst water data)
    bls = flowsa.getFlowByActivity(flowclass=['Employment'], datasource='BLS_QCEW', years=[2002])

    # clean df
    # NOTE(review): flow_by_activity_fields is a module-level name, not
    # imported above — assumed available in this module's namespace
    bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict)
    bls = clean_bls_qcew_fba(bls, attr)

    # bls_agg = agg_by_geoscale(bls, 'state', 'national', fba_default_grouping_fields)
    # keep only national-level employment rows
    bls_agg = filter_by_geoscale(bls, 'national')

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(bls_agg, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None ( does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={'SectorProducedBy': 'Sector'})

    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
    # aren't in list
    sector_list = df_wsec['Sector'].unique().tolist()
    # subset fba allocation table to the values in the activity list, based on overlapping sectors
    bls_wsec = bls_wsec.loc[bls_wsec['Sector'].isin(sector_list)]
    # calculate proportional ratios
    bls_wsec = proportional_allocation_by_location_and_sector(bls_wsec, 'Sector')  # , 'agg')
    bls_wsec = bls_wsec.rename(columns={'FlowAmountRatio': 'EmployeeRatio',
                                        'FlowAmount': 'Employees'})

    # merge the two dfs
    df = pd.merge(df_wsec, bls_wsec[['Sector', 'EmployeeRatio', 'Employees']],
                  how='left', left_on='Sector', right_on='Sector')
    df['EmployeeRatio'] = df['EmployeeRatio'].fillna(0)
    # calculate gal/employee in 2002
    # NOTE(review): rows with no employment match keep Employees = NaN, so
    # FlowAmount becomes NaN here rather than 0 — confirm intended
    df.loc[:, 'FlowAmount'] = (df['FlowAmount'] * df['EmployeeRatio']) / df['Employees']
    df.loc[:, 'Unit'] = 'gal/employee'
    # drop cols
    df = df.drop(columns=['Employees', 'EmployeeRatio'])

    return df
def getFlowByActivity(datasource, year, flowclass=None, geographic_level=None,
                      download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data. Optional. E.g.
        'national', 'state', 'county'.
    :param download_if_missing: bool, if True will attempt to load from remote
        server prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format, or None if the FBA
        could neither be loaded, downloaded, nor generated
    """
    from esupy.processed_data_mgmt import download_from_remote
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of fba
    fba = load_preprocessed_output(fba_meta, paths)

    # Remote download fallback
    if fba is None and download_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)

    # Generate-and-load fallback
    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        flowsa.flowbyactivity.main(year=year, source=datasource)
        fba = load_preprocessed_output(fba_meta, paths)

    if fba is None:
        # bug fix: previously execution fell through after this error and the
        # optional-parameter filtering below raised TypeError on None
        log.error('getFlowByActivity failed, FBA not found')
        return None
    log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
def getFlowByActivity(flowclass, years, datasource, geographic_level='all'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years [2015], or [2010,2011,2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: default set to 'all', which will load all geographic scales in the \
        FlowByActivity, can specify 'national', 'state', 'county'
    :return: a pandas DataFrame in FlowByActivity format
    """
    # collect one frame per year and concat once (avoids quadratic loop-concat)
    fba_frames = []
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) + ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fba_frames.append(fba)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource + ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                    datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fba_frames.append(fba)
            except OSError:
                # bug fix: a failed HTTPS read raises URLError/HTTPError
                # (OSError subclasses), never FileNotFoundError, so the
                # original error log was unreachable; also added the missing
                # space before "and year"
                log.error("No parquet file found for datasource " + datasource +
                          " and year " + str(y) + " in flowsa or Data Commons")
    fbas = pd.concat(fba_frames, sort=False) if fba_frames else pd.DataFrame()
    # if geographic level specified, only load rows in geo level
    if geographic_level != 'all':
        fbas = filter_by_geoscale(fbas, geographic_level)
    return fbas
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector

    For each FBA source named in the method yaml: clean the FBA, subset by
    activity set, assign sectors, map to the federal elementary flow list,
    allocate flows to sectors (directly or via an allocation FBA), aggregate
    geographically and across sector levels, then concat all activity-set
    results and store the final FlowBySector parquet.
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            # NOTE: cleaning functions are resolved by name from this module
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath + v['activity_set_file'], dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity (match on either produced-by or consumed-by)
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " + attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'], attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo, sectorsourcename=method['target_sector_source'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__], v["clean_fba_w_sec_df_fxn"])(
                        flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k
                flow_subset_mapped = map_elementary_flows(flow_subset_wsec, mapping_files)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] +
                             " for year " + str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields, fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__], attr["clean_allocation_fba"])(
                            fba_allocation, attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivty source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(
                            fba_allocation, from_scale, to_scale, fba_default_grouping_fields)
                    # else, if fba is more aggregated than allocation table, use fba as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale)

                    # assign sector to allocation dataset
                    # todo: add sectorsourcename col value
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation, sectorsourcename=method['target_sector_source'])

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " + attr['allocation_source'])
                    fba_allocation_wsec = generalize_activity_field_names(fba_allocation_wsec)

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__], attr["clean_allocation_fba_w_sec"])(
                            fba_allocation_wsec, attr, method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation_wsec, k, names)

                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for subset of " +
                                 attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr, v)

                    # create flow allocation ratios for each activity
                    flow_alloc_list = []
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(fba_allocation_subset, k, [n])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2, attr['allocation_method'])
                            # tag ratios with the activity they allocate
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k)

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

                    # merge fba df w/flow allocation dataset
                    # merged twice: once on the produced-by sector, once on consumed-by
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorProducedBy', 'ActivityProducedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorConsumedBy', 'ActivityConsumedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                            'FlowAmountRatio_y', 'FlowAmountRatio',
                                            'FBA_Activity_x', 'FBA_Activity_y'])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields_w_activity, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale, fbs_grouping_fields_w_activities)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, fbs_grouping_fields_w_activities)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, flow_by_sector_fields_w_activity)

                # compare flowbysector with flowbyactivity
                check_for_differences_between_fba_load_and_fbs_output(
                    flow_subset_mapped, fbs_agg, aset, k)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                fbs_1 = fbs_agg.loc[
                    (fbs_agg[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs_agg.loc[
                    (fbs_agg[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg[fbs_activity_fields[1]].isnull())].reset_index(drop=True)
                fbs_3 = fbs_agg.loc[
                    (fbs_agg[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # check if losing data by subsetting at specified sector length
                log.info('Checking if losing data by subsetting dataframe')
                fbs_sector_subset_2 = check_if_losing_sector_data(
                    fbs_agg, fbs_sector_subset, method['target_sector_level'])

                # set source name
                fbs_sector_subset_2.loc[:, 'SectorSourceName'] = method['target_sector_source']

                # drop activity columns
                del fbs_sector_subset_2['ActivityProducedBy'], fbs_sector_subset_2['ActivityConsumedBy']

                log.info("Completed flowbysector for activity subset with flows " +
                         ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset_2)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector

    Older FBS-builder variant: loads each FBA source listed in the method yaml,
    standardizes names/units, subsets by activity set, assigns sectors,
    allocates to sectors (directly or via an allocation source), aggregates
    geographically and across sector levels, and stores the combined result.
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        # standardization functions are resolved by name from this module
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " +
                     ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " +
                         ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(
                    flow_subset, names, v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                              activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to geoscale want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                              v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # location column pad zeros if necessary
            # pads FIPS strings shorter than 5 chars on the right with zeros
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS)
            # the second version, "flow_subset_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(
                flow_subset, sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(
                flow_subset, sectorsourcename=method['target_sector_source'], levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] +
                         " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(
                    flowclass=[attr['allocation_source_class']],
                    datasource=attr['allocation_source'],
                    years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " +
                         ', '.join(map(str, names)) + " at the " +
                         attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivty source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale,
                                                     fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(
                    fba_allocation, sectorsourcename=method['target_sector_source'],
                    levelofSectoragg=attr['allocation_sector_aggregation'])
                # subset fba datsets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " +
                             attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
                # aren't in list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                # merged twice: once on the produced-by sector, once on consumed-by
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                # drop columns where both sector produced/consumed by in flow allocation dif is null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                        'FlowAmountRatio_y', 'FlowAmountRatio',
                                        'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"})

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            # NOTE(review): result is unused beyond this assignment
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()
            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[
                household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " +
                     ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
def geoscale_flow_comparison(flowclass, years, datasource,
                             activitynames=None, to_scale='national'):
    """
    Aggregate county data to state and national, and state data to national
    level, allowing for comparisons in flow totals for a given flowclass and
    industry.

    Activities are first assigned to NAICS (rather than aggregating on
    FlowNames) to negate any changes in flownames across time/geoscale, and
    units are standardized.

    :param flowclass: list, flow 'Class' values, e.g. ['Water']
    :param years: list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource, e.g. 'USGS_NWIS_WU'
    :param activitynames: list of activity names to compare, or None / ['all']
        (the default) to compare aggregation for all activities
    :param to_scale: 'national' or 'state', the geoscale aggregated up to
    :return: pandas DataFrame with one 'FlowAmount_<geoscale>' column per
        source geoscale, merged on sector/location/year columns
    :raises ValueError: if to_scale is not 'national' or 'state'
    """
    # avoid the mutable-default-argument pitfall; ['all'] is the documented
    # default and is never mutated, but None-sentinel is the safe idiom
    if activitynames is None:
        activitynames = ['all']

    # create list of geoscales for aggregation; validate up-front so an
    # invalid to_scale fails fast instead of raising NameError later
    if to_scale == 'national':
        geoscales = ['national', 'state', 'county']
    elif to_scale == 'state':
        geoscales = ['state', 'county']
    else:
        raise ValueError("to_scale must be 'national' or 'state'")

    # load parquet file checking aggregation
    flows = flowsa.getFlowByActivity(flowclass=flowclass, years=years,
                                     datasource=datasource)

    # fill null values
    flows = flows.fillna(value=fba_fill_na_dict)
    # convert units
    flows = convert_unit(flows)

    # if activityname set to default, then compare aggregation for all
    # activities. If looking at particular activity, filter that activity out
    if activitynames == ['all']:
        flow_subset = flows.copy()
    else:
        flow_subset = flows[
            (flows[fba_activity_fields[0]].isin(activitynames)) |
            (flows[fba_activity_fields[1]].isin(activitynames))]

    # Reset index values after subset; drop=True avoids carrying a stray
    # 'index' column into the merges below
    flow_subset = flow_subset.reset_index(drop=True)

    # pull naics crosswalk. SourceName is populated from datasource, so pass
    # datasource directly instead of Series.all(), whose value-returning
    # behavior on object dtype is version-dependent
    mapping = get_activitytosector_mapping(datasource)

    # assign naics to activities
    # usgs datasource is not easily assigned to naics for checking totals,
    # so instead standardize activity names
    if datasource == 'USGS_NWIS_WU':
        flow_subset = standardize_usgs_nwis_names(flow_subset)
    else:
        flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']],
                               left_on='ActivityProducedBy',
                               right_on='Activity', how='left').rename(
            {'Sector': 'SectorProducedBy'}, axis=1)
        flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']],
                               left_on='ActivityConsumedBy',
                               right_on='Activity', how='left').rename(
            {'Sector': 'SectorConsumedBy'}, axis=1)
    flow_subset = flow_subset.drop(
        columns=['ActivityProducedBy', 'ActivityConsumedBy',
                 'Activity_x', 'Activity_y', 'Description'],
        errors='ignore')
    flow_subset['SectorProducedBy'] = flow_subset['SectorProducedBy'].replace(
        {np.nan: None}).astype(str)
    flow_subset['SectorConsumedBy'] = flow_subset['SectorConsumedBy'].replace(
        {np.nan: None}).astype(str)

    # create empty df list
    flow_dfs = []
    for i in geoscales:
        try:
            # filter by geoscale
            fba_from_scale = filter_by_geoscale(flow_subset, i)

            # remove/add column names as a column
            group_cols = fba_default_grouping_fields.copy()
            for j in ['Location', 'ActivityProducedBy', 'ActivityConsumedBy']:
                group_cols.remove(j)
            for j in ['SectorProducedBy', 'SectorConsumedBy']:
                group_cols.append(j)

            # county sums to state and national, state sums to national
            if to_scale == 'state':
                fba_from_scale['Location'] = fba_from_scale['Location'].apply(
                    lambda x: str(x[0:2]))
            elif to_scale == 'national':
                fba_from_scale['Location'] = US_FIPS

            # aggregate
            fba_agg = aggregator(fba_from_scale, group_cols)

            # rename flowamount column, based on geoscale
            fba_agg = fba_agg.rename(
                columns={"FlowAmount": "FlowAmount_" + i})

            # drop fields irrelevant to aggregated flow comparison; use a
            # plain list of labels (the original passed a DataFrame slice,
            # which only worked because iterating a DataFrame yields names)
            fba_agg = fba_agg.drop(
                columns=['MeasureofSpread', 'Spread', 'DistributionType',
                         'DataReliability', 'DataCollection'],
                errors='ignore')

            # reset index
            fba_agg = fba_agg.reset_index(drop=True)

            flow_dfs.append(fba_agg)
        except Exception:
            # a source may have no data at this geoscale; log and continue
            # instead of silently swallowing every error (was: bare except)
            log.info('Skipping geoscale ' + i + ' for ' + datasource +
                     ': no data found or aggregation failed')

    # merge list of dfs by column
    flow_comparison = reduce(
        lambda left, right: pd.merge(
            left, right,
            on=['Class', 'SourceName', 'FlowName', 'Unit',
                'SectorProducedBy', 'SectorConsumedBy', 'Compartment',
                'Location', 'LocationSystem', 'Year'],
            how='outer'),
        flow_dfs)

    # sort df
    flow_comparison = flow_comparison.sort_values(
        ['Year', 'Location', 'SectorProducedBy', 'SectorConsumedBy',
         'FlowName', 'Compartment'])

    return flow_comparison
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Convert Blackhurst flow amounts to gallons per employee, using 2002
    BLS QCEW national employment counts allocated proportionally by
    location and sector.

    :param df_wsec: DataFrame of flows with sector columns ('Location',
        'SectorConsumedBy', 'FlowAmount', ...)
    :param attr: dict, activity-set attributes, passed to the BLS QCEW
        source-specific cleaning function
    :param method: dict, FBS method yaml; supplies 'target_sector_source'
    :return: DataFrame with 'FlowAmount' expressed in 'gal/employee' and
        intermediate employment columns removed
    """
    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict, \
        proportional_allocation_by_location_and_activity, \
        filter_by_geoscale, harmonize_units
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    # pull 2002 employment data and restrict it to national totals
    employment = flowsa.getFlowByActivity(flowclass=['Employment'],
                                          datasource='BLS_QCEW',
                                          years=[2002])
    employment = filter_by_geoscale(employment, 'national')

    # standard fba cleanup, unit harmonization, then source-specific cleaning
    employment = clean_df(employment, flow_by_activity_fields,
                          fba_fill_na_dict)
    employment = harmonize_units(employment)
    employment = clean_bls_qcew_fba(employment, attr=attr)

    # attach sector codes from the method's target sector source
    emp_wsec = add_sectors_to_flowbyactivity(
        employment, sectorsourcename=method['target_sector_source'])

    # discard rows with no assigned sector (does not occur with mining),
    # then rename columns for the helper-merge below
    emp_wsec = emp_wsec[emp_wsec['SectorProducedBy'].notnull()]
    emp_wsec = emp_wsec.rename(columns={'SectorProducedBy': 'Sector',
                                        'FlowAmount': 'HelperFlow'})

    # join employment onto the flow data by location and consumed sector
    merged = pd.merge(df_wsec,
                      emp_wsec[['Location', 'Sector', 'HelperFlow']],
                      how='left',
                      left_on=['Location', 'SectorConsumedBy'],
                      right_on=['Location', 'Sector'])

    # unmatched rows carry no employment info — drop them, and zero-fill
    # any remaining missing helper flows before computing ratios
    merged = merged[merged['Sector'].notnull()]
    merged['HelperFlow'] = merged['HelperFlow'].fillna(0)

    # proportional employee ratios within each location
    result = proportional_allocation_by_location_and_activity(merged,
                                                              'Sector')
    result = result.rename(columns={'FlowAmountRatio': 'EmployeeRatio',
                                    'HelperFlow': 'Employees'})

    # rows with zero employees would divide by zero below — remove them
    result = result[result['Employees'] != 0]

    # calculate gal/employee in 2002
    result.loc[:, 'FlowAmount'] = \
        (result['FlowAmount'] * result['EmployeeRatio']) / result['Employees']
    result.loc[:, 'Unit'] = 'gal/employee'

    # drop intermediate allocation columns before returning
    return result.drop(columns=['Sector', 'Employees', 'EmployeeRatio'])