def store_test_flowbyactivity(csvname, year=None):
    """Store a test flow-by-activity CSV as a parquet file.

    Reads ``<datapath>/<csvname>.csv`` as strings, adds any missing
    flow-by-activity fields, and writes the result to the FBA output path.

    :param csvname: name of the csv file (without extension) in datapath
    :param year: optional year; when given it is appended to the output name
    :return: None
    """
    # Build the output name once; only append the year when one is given
    # (previously a failure with year=None logged a misleading "_None" name).
    name = csvname if year is None else csvname + "_" + str(year)
    f = fbaoutputpath + name + '.parquet'
    try:
        result = pd.read_csv(datapath + csvname + '.csv', dtype='str')
        # add missing columns and cast to the expected datatypes
        result = add_missing_flow_by_fields(result, flow_by_activity_fields)
        result.to_parquet(f, engine="pyarrow")
    except Exception:
        # narrowed from a bare except so KeyboardInterrupt/SystemExit
        # propagate; log.exception records the traceback for debugging
        log.exception('Failed to save ' + name + ' file.')
def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Aggregate a stewi-derived dataframe into flow-by-sector format.

    :param df: dataframe of stewi emissions carrying NAICS_lvl and Location
    :param inventory_dict: dictionary of inventory types and years
    :param NAICS_level: desired NAICS aggregation level
    :param geo_scale: desired geographic aggregation level
    :return: dataframe in flow-by-sector format
    """
    from stewi.globals import weighted_average

    # normalize locations to the requested geoscale before any grouping
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # group on sector level, flow, compartment and location;
    # include MetaSources when the column is present
    group_cols = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        group_cols.append('MetaSources')

    # collapse to one row per group, summing flow amounts
    fbs = df.groupby(group_cols).agg(
        {'FlowAmount': 'sum', 'Year': 'first', 'Unit': 'first'})

    # flow-amount-weighted reliability score per group
    fbs['DataReliability'] = weighted_average(
        df, 'DataReliability', 'FlowAmount', group_cols)
    fbs.reset_index(inplace=True)

    # map flows to the federal elementary flow list
    fbs = map_elementary_flows(fbs, list(inventory_dict.keys()))

    # conform column names to the flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # hardcoded fields; depending on the source data some may need to change
    fixed_fields = {
        'Class': 'Chemicals',
        'SectorConsumedBy': 'None',
        'SectorSourceName': 'NAICS_2012_Code',
        'FlowType': 'ELEMENTARY_FLOW',
    }
    for column, value in fixed_fields.items():
        fbs[column] = value
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # fill in any remaining flow-by-sector columns and check sector coverage
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
    fbs = check_for_missing_sector_data(fbs, NAICS_level)

    # canonical ordering
    fbs = fbs.sort_values(
        list(flow_by_sector_fields.keys())).reset_index(drop=True)

    # make sure the sector codes are NAICS 2012 codes
    fbs = replace_naics_w_naics_2012(fbs, 'NAICS_2012_Code')

    return fbs
def bls_clean_allocation_fba_w_sec(df_w_sec, **kwargs):
    """
    Clean up the BLS flow-by-activity-with-sectors dataframe by estimating
    suppressed data.

    :param df_w_sec: BLS flow-by-activity dataframe with sector columns
    :param kwargs: accepted for interface compatibility (attr, method); unused
    :return: cleaned dataframe with suppressed values estimated
    """
    from flowsa.flowbyfunctions import estimate_suppressed_data, sector_disaggregation, sector_aggregation, \
        flow_by_activity_wsec_mapped_fields, add_missing_flow_by_fields, replace_strings_with_NoneType

    # fill in suppressed values using the produced-by sector column
    df = estimate_suppressed_data(df_w_sec, 'SectorProducedBy')
    # make sure all expected columns exist with the proper datatypes
    df = add_missing_flow_by_fields(df, flow_by_activity_wsec_mapped_fields)
    df = replace_strings_with_NoneType(df)
    # sector (dis)aggregation intentionally disabled for now
    # df = sector_aggregation(df, fba_mapped_default_grouping_fields)
    # df = sector_disaggregation(df, fba_mapped_default_grouping_fields)
    return df
# NOTE(review): 'return df' below is the tail of a function whose definition
# precedes this chunk; it is left untouched here.
    return df


if __name__ == '__main__':
    # assign arguments
    args = parse_args()
    # assign yaml parameters (common.py fxn)
    config = load_sourceconfig(args['source'])
    # build the base url with strings that will be replaced
    build_url = build_url_for_query(config['url'])
    # replace parts of urls with specific instructions from source.py
    urls = assemble_urls_for_query(build_url, config, args)
    # create a list with data from all source urls
    dataframe_list = call_urls(urls, args)
    # concat the dataframes and parse data with specific instructions from source.py
    df = parse_data(dataframe_list, args)
    # log that data was retrieved
    log.info("Retrieved data for " + args['source'])
    # add any missing columns of data and cast to appropriate data type
    flow_df = add_missing_flow_by_fields(df, flow_by_activity_fields)
    # modify flow units
    flow_df = convert_fba_unit(flow_df)
    # sort df and reset index so output is deterministic
    flow_df = flow_df.sort_values([
        'Class', 'Location', 'ActivityProducedBy', 'ActivityConsumedBy',
        'FlowName', 'Compartment'
    ]).reset_index(drop=True)
    # save as parquet file named <source>_<year>
    parquet_name = args['source'] + '_' + args['year']
    store_flowbyactivity(flow_df, parquet_name)
def disaggregate_cropland(fba_w_sector, attr):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county
    level, use state level harvested cropland to create ratios.

    :param fba_w_sector: flow-by-activity dataframe with a 'Sector' column
    :param attr: activity-set attributes dict; 'allocation_source_year' is read
    :return: fba_w_sector with 4/5-digit cropland sectors disaggregated
    """
    import flowsa
    from flowsa.flowbyfunctions import generalize_activity_field_names, sector_aggregation,\
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, add_missing_flow_by_fields
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # drop pastureland data (NAICS 112 prefix)
    crop = fba_w_sector.loc[fba_w_sector['Sector'].apply(lambda x: str(x[0:3]))
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop['Sector'].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location: 2-digit state FIPS prefix of the county FIPS
    crop.loc[:, 'Location_tmp'] = crop['Location'].apply(lambda x: str(x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(
        flowclass=['Land'],
        years=[attr['allocation_source_year']],
        datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics,
                                          sectorsourcename='NAICS_2012_Code',
                                          levelofSectoragg='agg')
    # add missing fbs fields
    naics = add_missing_flow_by_fields(naics, flow_by_sector_fields)

    # aggregate sectors to create any missing naics levels
    naics = sector_aggregation(naics, fbs_default_grouping_fields)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics = sector_disaggregation(naics)
    # drop rows where sector consumed by is none and FlowAmount 0
    naics = naics[naics['SectorConsumedBy'].notnull()]
    naics = naics.loc[naics['FlowAmount'] != 0]
    # create ratios (per location, child sector share of parent)
    naics = sector_ratios(naics)
    # drop sectors < 4 digits
    #naics = naics[naics['SectorConsumedBy'].apply(lambda x: len(x) > 3)].reset_index(drop=True)

    # create temporary sector column to match the two dfs on
    naics.loc[:, 'Location_tmp'] = naics['Location'].apply(
        lambda x: str(x[0:2]))

    # for loop through naics lengths to determine naics 4 and 5 digits to
    # disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop['Sector'].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset.loc[:, 'Sector_tmp'] = crop_subset['Sector'].apply(
            lambda x: x[0:i])
        # if duplicates drop all rows (both parent and child present -> no
        # disaggregation needed for that location/parent pair)
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df['Sector'].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics.loc[naics['SectorConsumedBy'].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset.loc[:, 'Sector_tmp'] = naics_subset[
            'SectorConsumedBy'].apply(lambda x: x[0:i])
        # merge the two df based on locations (county rows match the state
        # ratios through the 2-digit Location_tmp)
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 'SectorConsumedBy', 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=['Sector', 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=['Sector', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={"SectorConsumedBy": "Sector"})
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data (NAICS 112 prefix), untouched by the disaggregation above
    pasture = fba_w_sector.loc[fba_w_sector['Sector'].apply(
        lambda x: str(x[0:3])) == '112'].reset_index(drop=True)

    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    return fba_w_sector
data["ActivityProducedBy"] = produced_by(entry) data["ActivityConsumedBy"] = None dataframe = pd.read_csv(external_datapath + "/" + entry, header=0, dtype=str) for col in dataframe.columns: if "Percent" in str(col): del dataframe[col] for index, row in dataframe.iterrows(): for column in dataframe.columns: if "Material" != column: col_string = column.split() data["Unit"] = col_string[1] data['FlowName'] = dataframe.iloc[index][ "Material"] + " " + col_string[0] if dataframe.iloc[index][column] != "-": data["FlowAmount"] = int( dataframe.iloc[index][column]) output = output.append(data, ignore_index=True) output = assign_fips_location_system( output, '2014') flow_df = add_missing_flow_by_fields(output, flow_by_activity_fields) # add missing dataframe fields (also converts columns to desired datatype) flow_df = clean_df(flow_df, flow_by_activity_fields, fba_fill_na_dict, drop_description=False) parquet_name = 'California_Commercial_bySector' store_flowbyactivity(flow_df, parquet_name, 2014)
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method
        yaml name
    :return: flowbysector (saved as a parquet file; nothing is returned)
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set; the yaml names a
        # module-level function that is resolved via getattr
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__],
                            v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " +
                     ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " +
                     ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(
                flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue
                # warning and use data at less aggregated geoscale, and sum
                # to specified geoscale
                log.info("Checking if flowbyactivity data exists for " +
                         ', '.join(map(str, names)) +
                         " at a less aggregated level")
                new_geoscale_to_use = \
                    check_if_data_exists_at_less_aggregated_geoscale(
                        flow_subset, names, v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs
            # activity to target scale
            if fips_number_key[activity_from_scale] > \
                    fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale +
                         " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset,
                                              activity_from_scale,
                                              activity_to_scale,
                                              fba_default_grouping_fields,
                                              names)
            # else, aggregate to geoscale want to use
            elif fips_number_key[activity_from_scale] > \
                    fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale +
                         " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset,
                                              activity_from_scale,
                                              v['geoscale_to_use'],
                                              fba_default_grouping_fields,
                                              names)
            # else, if usgs is more aggregated than allocation table, filter
            # relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset,
                                                 activity_from_scale, names)

            # location column pad zeros if necessary (FIPS codes are 5 chars)
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the flow
            # subset; the first version "flow_subset" is the most
            # disaggregated version of the Sectors (NAICS); the second
            # version, "flow_subset_agg" includes only the most aggregated
            # level of sectors
            log.info("Adding sectors to " + k + " for " +
                     ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(
                flow_subset,
                sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(
                flow_subset,
                sectorsourcename=method['target_sector_source'],
                levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc
            # ratios, else need to use allocation dataframe to create sector
            # allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " +
                         attr['allocation_source'] + " for year " +
                         str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(
                    flowclass=[attr['allocation_source_class']],
                    datasource=attr['allocation_source'],
                    years=[attr['allocation_source_year']
                           ]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(
                    value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['FlowName'].isin(
                            attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(
                            attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " +
                         ', '.join(map(str, names)) + " at the " +
                         attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(
                    fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivty
                # source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate
                # allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(
                        fba_allocation, from_scale, to_scale,
                        fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use
                # usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(
                        fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(
                    fba_allocation,
                    sectorsourcename=method['target_sector_source'],
                    levelofSectoragg=attr['allocation_sector_aggregation'])
                # subset fba datsets to only keep the naics associated with
                # usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] +
                         " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(
                    fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(
                    drop=True)
                # generalize activity field names to enable link to water
                # withdrawal table
                log.info("Generalizing activity names in subset of " +
                         attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(
                    fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(
                    columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset "
                             "of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(
                        fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " +
                         attr['allocation_source'])
                flow_allocation = allocate_by_sector(
                    fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop any
                # rows of data in the flow df that aren't in list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity
                # list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation "
                         "dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec,
                                                flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " +
                         attr['allocation_source'])
                # merge twice: once on produced-by, once on consumed-by,
                # so a ratio can attach through either sector column
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector',
                                     'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem',
                             'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector',
                                     'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem',
                             'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                # drop columns where both sector produced/consumed by in flow
                # allocation dif is null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                 how='all').reset_index()

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                    fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x',
                                        'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio',
                                        'ActivityProducedBy',
                                        'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"})

            # drop rows where flowamount = 0 (although this includes dropping
            # suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " +
                     method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < \
                    fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale,
                                  fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " +
                     method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[
                household['NAICS_Level_to_Use_For'] ==
                method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())

            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list)
                           )].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " +
                     ', '.join(map(str, names)))
            fbss.append(fbs)

    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified
    # sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
def stewicombo_to_sector(inventory_dict, NAICS_level, geo_level, compartments):
    """
    Combine stewi inventories and aggregate emissions to NAICS sectors,
    writing the result to a csv in the output directory.

    This function takes the following inputs:
        - inventory_dict: a dictionary of inventory types and years
          (e.g., {'NEI':'2017', 'TRI':'2017'})
        - NAICS_level: desired NAICS aggregation level (2-6)
        - geo_level: desired geographic aggregation level
          ('National', 'State', 'County')
        - compartments: list of compartments to include
          (e.g., 'water', 'air', 'land')
    """
    ## run stewicombo to combine inventories, filter for LCI, remove overlap
    df = stewicombo.combineFullInventories(inventory_dict,
                                           filter_for_LCI=True,
                                           remove_overlap=True,
                                           compartments=compartments)

    ## create mapping to convert facility IDs --> NAICS codes
    facility_mapping = pd.DataFrame()
    # for all inventories in list:
    # - load facility data from stewi output directory, keeping only the
    #   facility IDs, NAICS codes, and geographic information
    # - create new column indicating inventory source (database and year)
    # - append data to master data frame
    for i in range(len(inventory_dict)):
        # define inventory name as inventory type + inventory year
        # (e.g., NEI_2017)
        inventory_name = list(inventory_dict.keys())[i] + '_' + list(
            inventory_dict.values())[i]
        facilities = pd.read_csv(
            stw_output_dir + 'facility/' + inventory_name + '.csv',
            usecols=['FacilityID', 'NAICS', 'State', 'County'],
            dtype={
                'FacilityID': str,
                'NAICS': int
            })
        # rename counties as County + State (e.g., Bristol_MA), since some
        # states share county names
        facilities['County'] = facilities['County'] + '_' + facilities['State']
        facilities['SourceYear'] = inventory_name
        facility_mapping = facility_mapping.append(facilities)

    ## merge dataframes to assign NAICS codes based on facility IDs
    df['SourceYear'] = df['Source'] + '_' + df['Year']
    df = pd.merge(df, facility_mapping, how='left',
                  left_on=['FacilityID', 'SourceYear'],
                  right_on=['FacilityID', 'SourceYear'])

    ## subtract emissions for air transportation from airports
    # PLACEHOLDER TO SUBTRACT EMISSIONS FOR AIR TRANSPORT

    ## aggregate data based on NAICS code and chemical ID
    # add levelized NAICS code (truncate to the first NAICS_level digits)
    df['NAICS_lvl'] = df['NAICS'].astype(str).str[0:NAICS_level]
    # assign grouping variables based on desired geographic aggregation level
    # NOTE(review): an unrecognized geo_level leaves grouping_vars undefined
    # and raises NameError below
    if geo_level == 'National':
        grouping_vars = ['NAICS_lvl', 'SRS_ID', 'Compartment']
    elif geo_level == 'State':
        grouping_vars = ['NAICS_lvl', 'SRS_ID', 'Compartment', 'State']
    elif geo_level == 'County':
        grouping_vars = ['NAICS_lvl', 'SRS_ID', 'Compartment', 'County']

    # aggregate by NAICS code, chemical ID, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'NAICS_lvl': 'first',
        'Compartment': 'first',
        'FlowName': 'first',
        'Year': 'first',
        'Unit': 'first',
        'State': 'first',
        'County': 'first'
    })
    # add reliability score (flow-amount-weighted per group)
    fbs['DataReliability'] = weighted_average(df, 'ReliabilityScore',
                                              'FlowAmount', grouping_vars)

    ## perform operations to match flowbysector format
    # rename columns to match flowbysector format
    fbs = fbs.rename(
        columns={
            "NAICS_lvl": "SectorProducedBy",
            "FlowName": "Flowable",
            "Compartment": "Context"
        })
    # add hardcoded data
    # 'National' column provides the location label when geo_level=='National'
    fbs['National'] = 'United States'
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['Location'] = fbs[geo_level]
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])
    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    ## save result to output directory
    fbs.to_csv(output_dir + 'Chemicals_' + geo_level + '.csv')
def stewicombo_to_sector(inventory_dict, NAICS_level, geo_scale, compartments):
    """
    Returns emissions from stewicombo in fbs format
    :param inventory_dict: a dictionary of inventory types and years
        (e.g., {'NEI':'2017', 'TRI':'2017'})
    :param NAICS_level: desired NAICS aggregation level, using
        sector_level_key, should match target_sector_level
    :param geo_scale: desired geographic aggregation level
        ('national', 'state', 'county'), should match target_geoscale
    :param compartments: list of compartments to include
        (e.g., 'water', 'air', 'soil'), use None to include all compartments
    """
    from stewi.globals import output_dir as stw_output_dir
    from stewi.globals import weighted_average
    import stewi
    import stewicombo
    import facilitymatcher
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches
    from facilitymatcher import output_dir as fm_output_dir

    # translate the named sector level (e.g. 'NAICS_6') to a digit count
    NAICS_level_value = sector_level_key[NAICS_level]

    ## run stewicombo to combine inventories, filter for LCI, remove overlap
    df = stewicombo.combineFullInventories(inventory_dict,
                                           filter_for_LCI=True,
                                           remove_overlap=True,
                                           compartments=compartments)
    df.drop(columns=['SRS_CAS', 'SRS_ID', 'FacilityIDs_Combined'],
            inplace=True)

    facility_mapping = pd.DataFrame()
    # load facility data from stewi output directory, keeping only the
    # facility IDs, and geographic information
    inventory_list = list(inventory_dict.keys())
    for i in range(len(inventory_dict)):
        # define inventory name as inventory type + inventory year
        # (e.g., NEI_2017)
        inventory_name = inventory_list[i] + '_' + list(
            inventory_dict.values())[i]
        facilities = pd.read_csv(stw_output_dir + 'facility/' +
                                 inventory_name + '.csv',
                                 usecols=['FacilityID', 'State', 'County'],
                                 dtype={'FacilityID': str})
        if len(facilities[facilities.duplicated(subset='FacilityID',
                                                keep=False)]) > 0:
            log.info('Duplicate facilities in ' + inventory_name +
                     ' - keeping first listed')
            facilities.drop_duplicates(subset='FacilityID',
                                       keep='first',
                                       inplace=True)
        facility_mapping = facility_mapping.append(facilities)

    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    ## merge dataframes to assign facility information based on facility IDs
    df = pd.merge(df, facility_mapping, how='left', on='FacilityID')

    ## Access NAICS From facility matcher and assign based on FRS_ID
    all_NAICS = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)
    all_NAICS = all_NAICS.loc[all_NAICS['PRIMARY_INDICATOR'] == 'PRIMARY']
    all_NAICS.drop(columns=['PRIMARY_INDICATOR'], inplace=True)
    all_NAICS = naics_expansion(all_NAICS)
    if len(all_NAICS[all_NAICS.duplicated(subset=['FRS_ID', 'Source'],
                                          keep=False)]) > 0:
        log.info('Duplicate primary NAICS reported - keeping first')
        all_NAICS.drop_duplicates(subset=['FRS_ID', 'Source'],
                                  keep='first',
                                  inplace=True)
    df = pd.merge(df, all_NAICS, how='left', on=['FRS_ID', 'Source'])

    # add levelized NAICS code prior to aggregation
    df['NAICS_lvl'] = df['NAICS'].str[0:NAICS_level_value]

    ## subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    if 'NEI' in inventory_list:
        log.info('Reassigning emissions from air transportation from airports')

        # obtain and prepare SCC dataset
        df_airplanes = stewi.getInventory('NEI',
                                          inventory_dict['NEI'],
                                          stewiformat='flowbySCC')
        df_airplanes = df_airplanes[df_airplanes['SCC'] ==
                                    air_transportation_SCC]
        df_airplanes['Source'] = 'NEI'
        df_airplanes = addChemicalMatches(df_airplanes)
        df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
        df_airplanes.drop(columns=['SCC'], inplace=True)

        facility_mapping_air = df[['FacilityID', 'NAICS']]
        facility_mapping_air.drop_duplicates(keep='first', inplace=True)
        df_airplanes = df_airplanes.merge(facility_mapping_air,
                                          how='left',
                                          on='FacilityID')

        df_airplanes['Year'] = inventory_dict['NEI']
        # keep only airplane emissions reported at airport facilities
        df_airplanes = df_airplanes[
            df_airplanes['NAICS'].str[0:len(airport_NAICS)] == airport_NAICS]

        # subtract airplane emissions from airport NAICS at individual
        # facilities
        df_planeemissions = df_airplanes[[
            'FacilityID', 'FlowName', 'FlowAmount'
        ]]
        df_planeemissions.rename(columns={'FlowAmount': 'PlaneEmissions'},
                                 inplace=True)
        df = df.merge(df_planeemissions,
                      how='left',
                      on=['FacilityID', 'FlowName'])
        df[['PlaneEmissions']] = df[['PlaneEmissions']].fillna(value=0)
        df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
        df.drop(columns=['PlaneEmissions'], inplace=True)

        # add airplane emissions under air transport NAICS
        df_airplanes.loc[:, 'NAICS_lvl'] = air_transportation_naics[
            0:NAICS_level_value]
        df = pd.concat([df, df_airplanes], ignore_index=True)

    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score (flow-amount-weighted per group)
    fbs['DataReliability'] = weighted_average(df, 'ReliabilityScore',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, inventory_list)

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data, some of these fields
    # may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])
    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    return fbs