def extract_facility_data(inventory_dict):
    """
    Returns df of facilities from each inventory in inventory_dict,
    including FIPS code
    :param inventory_dict: a dictionary of inventory types and years
        (e.g., {'NEI':'2017', 'TRI':'2017'})
    :return: df, the 'FacilityID', 'State', 'County' and 'NAICS' columns of
        every inventory stacked together, after being passed through
        apply_county_FIPS
    """
    import stewi

    # collect one frame per inventory and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0
    facility_frames = []
    for database, year in inventory_dict.items():
        # define inventory name as inventory type + inventory year
        # (e.g., NEI_2017); only used for logging
        inventory_name = f'{database}_{year}'
        facilities = stewi.getInventoryFacilities(database, year)
        # keep only the facility IDs and geographic information
        facilities = facilities[['FacilityID', 'State', 'County', 'NAICS']]
        if facilities.duplicated(subset='FacilityID', keep=False).any():
            log.debug('Duplicate facilities in %s - keeping first listed',
                      inventory_name)
            # reassign instead of inplace=True so a column slice is never
            # mutated in place
            facilities = facilities.drop_duplicates(subset='FacilityID',
                                                    keep='first')
        facility_frames.append(facilities)

    # pd.concat raises on an empty list, so fall back to an empty df when
    # no inventories were requested
    if facility_frames:
        facility_mapping = pd.concat(facility_frames)
    else:
        facility_mapping = pd.DataFrame()

    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    return facility_mapping
def obtain_NAICS_from_facility_matcher(inventory_list):
    """
    Returns dataframe of the facilities included in inventory_list with
    their primary NAICS; when more than one primary NAICS is reported for
    an FRS_ID/Source pair, the first one listed is kept.
    :param inventory_list: a list of inventories (e.g., ['NEI', 'TRI'])
    :return: df
    """
    import facilitymatcher

    # Access NAICS from facility matcher and assign based on FRS_ID
    naics_df = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)

    # keep only rows flagged as primary; the flag column is then redundant
    is_primary = naics_df['PRIMARY_INDICATOR'] == 'PRIMARY'
    naics_df = naics_df.loc[is_primary].drop(columns=['PRIMARY_INDICATOR'])

    naics_df = naics_expansion(naics_df)

    key_cols = ['FRS_ID', 'Source']
    if naics_df.duplicated(subset=key_cols, keep=False).any():
        log.debug('Duplicate primary NAICS reported - keeping first')
        naics_df = naics_df.drop_duplicates(subset=key_cols, keep='first')

    return naics_df
def direct_allocation_method(flow_subset_mapped, k, names, method):
    """
    Directly assign activities to sectors
    :param flow_subset_mapped: df, FBA with flows converted using
        fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns; a plain copy of flow_subset_mapped
        when the source's activities are sector-like
    """
    log.info('Directly assigning activities to sectors')
    fbs = flow_subset_mapped.copy()

    # sector-like sources are returned untouched; the data-loss check
    # below only applies when activities are not sector-like
    if load_source_catalog()[k]['sector-like_activities'] is not False:
        return fbs

    first_activity_col = fba_activity_fields[0]
    second_activity_col = fba_activity_fields[1]

    checked_subsets = []
    for activity in names:
        log.debug('Checking for %s at %s', activity,
                  method['target_sector_level'])
        # rows where the activity appears in either activity column
        matches_activity = ((fbs[first_activity_col] == activity) |
                            (fbs[second_activity_col] == activity))
        activity_rows = fbs[matches_activity].reset_index(drop=True)
        checked_subsets.append(
            allocate_dropped_sector_data(activity_rows,
                                         method['target_sector_level']))

    return pd.concat(checked_subsets, ignore_index=True)
def check_flow_by_fields(flowby_df, flowbyfields):
    """
    Log, at debug level, the data type of each expected field in flowby_df
    next to the data type given for it in flowbyfields. Fields that cannot
    be looked up are reported in the log instead of raising. The dataframe
    is not modified.
    :param flowby_df: Either flowbyactivity or flowbysector df
    :param flowbyfields: Either flow_by_activity_fields or
        flow_by_sector_fields
    :return: None
    """
    for k, v in flowbyfields.items():
        try:
            log.debug("fba activity %s data type is %s",
                      k, flowby_df[k].values.dtype)
            log.debug("standard %s data type is %s", k, v[0]['dtype'])
        except (KeyError, IndexError, TypeError, AttributeError):
            # the field is missing from the df (or its spec entry is
            # malformed); the message uses %s placeholders so the field
            # name is actually rendered by the logger
            log.debug("Failed to find field %s in fba", k)
def dataset_allocation_method(flow_subset_mapped, attr, names, method,
                              k, v, aset, method_name, aset_names):
    """
    Method of allocation using a specified data source
    :param flow_subset_mapped: FBA subset mapped using federal
        elementary flow list
    :param attr: dictionary, attribute data from method yaml for
        activity set
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :param aset: dictionary items for FBS method yaml
    :param method_name: str, method name (not used in this function)
    :param aset_names: list, activity set names
    :return: df, flow_subset_mapped with FlowAmount scaled by the
        allocation ratios
    """
    from flowsa.validation import compare_df_units

    # optional method-yaml entries and the keyword each one is passed to
    # load_map_clean_fba under; only entries present in attr are forwarded
    optional_loader_args = (
        ('allocation_flow', 'flowname_subset'),
        ('allocation_compartment', 'compartment_subset'),
        ('clean_allocation_fba', 'clean_fba'),
        ('clean_allocation_fba_w_sec', 'clean_fba_w_sec'),
    )
    fba_dict = {kwarg: attr[yaml_key]
                for yaml_key, kwarg in optional_loader_args
                if yaml_key in attr}

    # load the allocation FBA
    fba_allocation_wsec = load_map_clean_fba(
        method, attr,
        fba_sourcename=attr['allocation_source'],
        df_year=attr['allocation_source_year'],
        flowclass=attr['allocation_source_class'],
        geoscale_from=attr['allocation_from_scale'],
        geoscale_to=v['geoscale_to_use'],
        **fba_dict)

    # subset fba datasets to only keep the sectors associated with
    # activity subset
    log.info("Subsetting %s for sectors in %s", attr['allocation_source'], k)
    fba_allocation_subset = get_fba_allocation_subset(
        fba_allocation_wsec, k, names,
        flowSubsetMapped=flow_subset_mapped,
        allocMethod=attr['allocation_method'])

    # if there is an allocation helper dataset, modify allocation df
    if 'helper_source' in attr:
        log.info("Using the specified allocation help for subset of %s",
                 attr['allocation_source'])
        fba_allocation_subset = allocation_helper(
            fba_allocation_subset, attr, method, v)

    # create flow allocation ratios for each activity, grouping on the
    # default fields minus the two activity columns
    group_cols = [col for col in fba_wsec_default_grouping_fields
                  if col not in ('ActivityProducedBy', 'ActivityConsumedBy')]
    ratio_frames = []
    for activity in names:
        log.debug("Creating allocation ratios for %s", activity)
        activity_alloc = get_fba_allocation_subset(
            fba_allocation_subset, k, [activity],
            flowSubsetMapped=flow_subset_mapped,
            allocMethod=attr['allocation_method'],
            activity_set_names=aset_names)
        if len(activity_alloc) == 0:
            log.info("No data found to allocate %s", activity)
            continue
        ratios = allocate_by_sector(
            activity_alloc, attr['allocation_method'], group_cols,
            flowSubsetMapped=flow_subset_mapped)
        ratio_frames.append(ratios.assign(FBA_Activity=activity))
    flow_allocation = pd.concat(ratio_frames, ignore_index=True)

    # generalize activity field names to enable link to main fba source
    log.info("Generalizing activity columns in subset of %s",
             attr['allocation_source'])
    flow_allocation = collapse_activity_fields(flow_allocation)

    # check for issues with allocation ratios
    check_allocation_ratios(flow_allocation, aset, method)

    # keep only the rows of the flow df whose sector columns hold a sector
    # that appears in the flow allocation df
    sector_list = flow_allocation['Sector'].unique().tolist()
    in_first_sector_col = \
        flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)
    in_second_sector_col = \
        flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list)
    flow_subset_mapped = flow_subset_mapped.loc[
        in_first_sector_col | in_second_sector_col]

    # check if fba and allocation dfs have the same LocationSystem
    log.info("Checking if flowbyactivity and allocation "
             "dataframes use the same location systems")
    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

    # merge fba df w/flow allocation dataset, once per entry in
    # activity_fields; the '_x'/'_y' suffixed columns used below come from
    # the repeated merge of the same right-hand columns
    log.info("Merge %s and subset of %s", k, attr['allocation_source'])
    ratio_cols = ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']
    for field_spec in activity_fields.values():
        # check units
        compare_df_units(flow_subset_mapped, flow_allocation)
        flow_subset_mapped = flow_subset_mapped.merge(
            flow_allocation[ratio_cols],
            left_on=['Location',
                     field_spec[1]["flowbysector"],
                     field_spec[0]["flowbyactivity"]],
            right_on=['Location', 'Sector', 'FBA_Activity'],
            how='left')

    # merge the flowamount columns, then fill null rows with 0 because
    # there is no allocation info
    combined_ratio = flow_subset_mapped['FlowAmountRatio_x'].fillna(
        flow_subset_mapped['FlowAmountRatio_y'])
    flow_subset_mapped.loc[:, 'FlowAmountRatio'] = combined_ratio
    flow_subset_mapped['FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio'].fillna(0)

    # drop rows where there is no allocation data
    fbs = flow_subset_mapped.dropna(
        subset=['Sector_x', 'Sector_y'], how='all').reset_index()

    # calculate flow amounts for each sector
    log.info("Calculating new flow amounts using flow ratios")
    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

    # drop the helper columns introduced by the merges
    log.info("Cleaning up new flow by sector")
    merge_helper_cols = ['Sector_x', 'FlowAmountRatio_x',
                         'Sector_y', 'FlowAmountRatio_y',
                         'FlowAmountRatio',
                         'FBA_Activity_x', 'FBA_Activity_y']
    fbs = fbs.drop(columns=merge_helper_cols)

    return fbs
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: optional modifiers of the subset:
        flowSubsetMapped - df, the mapped flow subset; required when
            allocMethod is 'proportional-flagged' and the sector-column
            subset is used
        allocMethod - str, allocation method; 'proportional-flagged'
            bases the subset on the sectors in flowSubsetMapped rather
            than on activitynames
        activity_set_names - df of activity set names; if it has an
            'allocation_subset_col' column, the result is further
            subset on the column/value it specifies
    :return: df, FBA subset
    :raises ValueError: if the 'proportional-flagged' subset is needed
        but flowSubsetMapped was not provided
    """
    # first determine if there are special cases that would modify the
    # typical method of subset. An example of a special case is when the
    # allocation method is 'proportional-flagged'
    fsm = kwargs.get('flowSubsetMapped')
    subset_by_sector_cols = kwargs.get('allocMethod') == 'proportional-flagged'
    asn = kwargs.get('activity_set_names')
    subset_by_column_value = \
        asn is not None and 'allocation_subset_col' in asn

    # load the source catalog
    src_info = load_source_catalog()[source]

    # determine which values the allocation table is subset on
    if src_info['sector-like_activities'] is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values pertaining to
        # list of activity names, then list the related sectors
        df = df.loc[df['Activity'].isin(activitynames)]
        values_to_keep = pd.unique(df['Sector']).tolist()
    elif 'Sector' not in fba_allocation and subset_by_sector_cols:
        # special case: base the subset of data on sectors in the sector
        # columns of the mapped flow subset, not on activitynames
        if fsm is None:
            # previously this surfaced as an UnboundLocalError on fsm
            raise ValueError(
                "flowSubsetMapped must be provided when allocMethod is "
                "'proportional-flagged'")
        fsm_sub = fsm.loc[
            (fsm[fba_activity_fields[0]].isin(activitynames)) |
            (fsm[fba_activity_fields[1]].isin(activitynames))
        ].reset_index(drop=True)
        values_to_keep = pd.concat(
            [fsm_sub['SectorConsumedBy'], fsm_sub['SectorProducedBy']],
            ignore_index=True).drop_duplicates().dropna().tolist()
    else:
        values_to_keep = activitynames

    # subset fba allocation table to the chosen values: on the 'Sector'
    # column when it exists, otherwise on either of the two sector columns
    if 'Sector' in fba_allocation:
        rows_to_keep = fba_allocation['Sector'].isin(values_to_keep)
    else:
        rows_to_keep = (
            fba_allocation[fbs_activity_fields[0]].isin(values_to_keep) |
            fba_allocation[fbs_activity_fields[1]].isin(values_to_keep))
    fba_allocation_subset = \
        fba_allocation.loc[rows_to_keep].reset_index(drop=True)

    # if activity set names included in function call and activity set
    # names is not null, then subset data based on value and column
    # specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = asn[asn['name'].isin(activitynames)].reset_index(
            drop=True)
        subset_value_missing = asn_subset['allocation_subset'].isna()
        if subset_value_missing.all():
            # no subset requested for these activities
            pass
        elif subset_value_missing.any():
            # partially specified: report it and leave the subset as is
            log.error(
                'Define column and value to subset on in the activity set csv for all rows'
            )
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      col_to_subset, val_to_subset)
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] == val_to_subset
            ].reset_index(drop=True)

    return fba_allocation_subset