def add_sectors_to_flowbyactivity(flowbyactivity_df, sectorsourcename=sector_source_name, **kwargs):
    """
    Add Sectors from the Activity fields and map them to Sector
    from the crosswalk. No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :param kwargs: option to include the parameter 'allocationmethod',
        which modifies function behavior if = 'direct'; option to include
        'overwrite_sectorlevel' to force a sector aggregation level
    :return: a df with activity fields mapped to 'sectors'
    """
    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    cat = load_source_catalog()

    # an FBA df carries a single SourceName, so the first unique value
    # identifies the source
    s = pd.unique(flowbyactivity_df['SourceName'])[0]
    # load catalog info for source
    src_info = cat[s]
    # if activities are sector-like, check if need to modify mapping
    modify_sector_like_activities = \
        src_info.get('modify_sector-like_activities', False)
    # read the pre-determined level of sector aggregation of
    # each crosswalk from the source catalog
    levelofSectoragg = src_info['sector_aggregation_level']
    # if the FBS activity set is 'direct', overwrite the
    # levelofsectoragg, or if specified in fxn call
    if kwargs:
        if kwargs.get('allocationmethod') == 'direct':
            levelofSectoragg = 'disaggregated'
        if 'overwrite_sectorlevel' in kwargs:
            levelofSectoragg = kwargs['overwrite_sectorlevel']
    # if data are provided in NAICS format, use the mastercrosswalk
    if src_info['sector-like_activities'] and \
            modify_sector_like_activities is False:
        cw = load_sector_crosswalk()
        # subset the crosswalk to the requested sector source
        # (previously hard-coded to the package default
        # sector_source_name, ignoring the sectorsourcename parameter)
        sectors = cw.loc[:, [sectorsourcename]]
        # Create mapping df that's just the sectors at first
        mapping = sectors.drop_duplicates()
        # Add the sector twice as activities so mapping is identical
        mapping = mapping.assign(Activity=sectors[sectorsourcename])
        mapping = mapping.rename(columns={sectorsourcename: "Sector"})
        # add columns so can run expand_naics_list_fxn
        # if sector-like_activities = True, missing columns, so add
        mapping['ActivitySourceName'] = s
        # tmp assignment
        mapping['SectorType'] = None
        # Include all digits of naics in mapping, if levelofNAICSagg
        # is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    else:
        # if source data activities are text strings, or sector-like
        # activities should be modified, call on the manually created
        # source crosswalks
        mapping = get_activitytosector_mapping(s)
        # filter by SectorSourceName of interest
        mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
        # drop SectorSourceName
        mapping = mapping.drop(columns=['SectorSourceName'])
        # Include all digits of naics in mapping, if levelofNAICSagg
        # is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    # Merge the mapping in with flowbyactivity, once per
    # activity direction (Produced/Consumed)
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mapping.rename(
            columns={'Activity': flowbyactivity_field,
                     'Sector': flowbysector_field,
                     'SectorType': sector_type_field})
        # column doesn't exist for sector-like activities,
        # so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(
            columns=['ActivitySourceName'], errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(
            flowbyactivity_wsector_df, mappings_df_tmp,
            how='left', on=flowbyactivity_field)
    flowbyactivity_wsector_df = \
        flowbyactivity_wsector_df.replace({np.nan: None})
    # add sector source name
    flowbyactivity_wsector_df = \
        flowbyactivity_wsector_df.assign(SectorSourceName=sectorsourcename)

    # if activities are sector-like, check that the sectors
    # are in the crosswalk
    if src_info['sector-like_activities']:
        flowbyactivity_wsector_df = replace_naics_w_naics_2012(
            flowbyactivity_wsector_df, sectorsourcename)

    return flowbyactivity_wsector_df
def add_sectors_to_flowbyactivity(flowbyactivity_df, sectorsourcename=sector_source_name):
    """
    Add Sectors from the Activity fields and map them to Sector
    from the crosswalk. No allocation is performed.

    NOTE(review): this redefines add_sectors_to_flowbyactivity and
    shadows the earlier definition in this module — confirm which
    version is intended to survive.

    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :return: a df with activity fields mapped to 'sectors'
    """
    mappings = []

    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    cat = load_source_catalog()

    # build one mapping df per source present in the FBA
    for s in pd.unique(flowbyactivity_df['SourceName']):
        src_info = cat[s]
        # read the pre-determined level of sector aggregation of
        # each crosswalk from the source catalog
        levelofSectoragg = src_info['sector_aggregation_level']
        # if data are provided in NAICS format, use the mastercrosswalk
        if src_info['sector-like_activities']:
            cw = load_sector_crosswalk()
            # subset the crosswalk to the requested sector source
            # (previously hard-coded to the package default
            # sector_source_name, ignoring the sectorsourcename parameter)
            sectors = cw.loc[:, [sectorsourcename]]
            # Create mapping df that's just the sectors at first
            mapping = sectors.drop_duplicates()
            # Add the sector twice as activities so mapping is identical
            mapping = mapping.assign(Activity=sectors[sectorsourcename])
            mapping = mapping.rename(columns={sectorsourcename: "Sector"})
            # add columns so can run expand_naics_list_fxn
            # if sector-like_activities = True, missing columns, so add
            mapping['ActivitySourceName'] = s
            # tmp assignment
            mapping['SectorType'] = None
            # Include all digits of naics in mapping, if levelofNAICSagg
            # is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        else:
            # if source data activities are text strings, call on the
            # manually created source crosswalks
            mapping = get_activitytosector_mapping(s)
            # filter by SectorSourceName of interest
            mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
            # drop SectorSourceName
            mapping = mapping.drop(columns=['SectorSourceName'])
            # Include all digits of naics in mapping, if levelofNAICSagg
            # is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        mappings.append(mapping)
    mappings_df = pd.concat(mappings, sort=False)

    # Merge the mapping in with flowbyactivity, once per
    # activity direction (Produced/Consumed)
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mappings_df.rename(
            columns={'Activity': flowbyactivity_field,
                     'Sector': flowbysector_field,
                     'SectorType': sector_type_field})
        # column doesn't exist for sector-like activities,
        # so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(
            columns=['ActivitySourceName'], errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(
            flowbyactivity_wsector_df, mappings_df_tmp,
            how='left', on=flowbyactivity_field)
    flowbyactivity_wsector_df = \
        flowbyactivity_wsector_df.replace({np.nan: None})

    return flowbyactivity_wsector_df
def dataset_allocation_method(flow_subset_mapped, attr, names, method, k, v, aset, method_name, aset_names):
    """
    Method of allocation using a specified data source
    :param flow_subset_mapped: FBA subset mapped using federal elementary flow list
    :param attr: dictionary, attribute data from method yaml for activity set
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :param aset: dictionary items for FBS method yaml
    :param method_name: str, method name
    :param aset_names: list, activity set names
    :return: df, allocated activity names
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'allocation_flow' in attr:
        fba_dict['flowname_subset'] = attr['allocation_flow']
    if 'allocation_compartment' in attr:
        fba_dict['compartment_subset'] = attr['allocation_compartment']
    if 'clean_allocation_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_allocation_fba']
    if 'clean_allocation_fba_w_sec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_allocation_fba_w_sec']

    # load the allocation FBA
    fba_allocation_wsec = load_map_clean_fba(
        method, attr, fba_sourcename=attr['allocation_source'],
        df_year=attr['allocation_source_year'],
        flowclass=attr['allocation_source_class'],
        geoscale_from=attr['allocation_from_scale'],
        geoscale_to=v['geoscale_to_use'], **fba_dict)

    # subset fba datasets to only keep the sectors associated
    # with activity subset
    log.info("Subsetting %s for sectors in %s", attr['allocation_source'], k)
    fba_allocation_subset = get_fba_allocation_subset(
        fba_allocation_wsec, k, names,
        flowSubsetMapped=flow_subset_mapped,
        allocMethod=attr['allocation_method'])

    # if there is an allocation helper dataset, modify allocation df
    if 'helper_source' in attr:
        log.info("Using the specified allocation help for subset of %s",
                 attr['allocation_source'])
        fba_allocation_subset = allocation_helper(
            fba_allocation_subset, attr, method, v)

    # create flow allocation ratios for each activity
    flow_alloc_list = []
    # activity columns are excluded so ratios are grouped by
    # sector/location, not by activity
    group_cols = fba_wsec_default_grouping_fields
    group_cols = [e for e in group_cols
                  if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
    for n in names:
        log.debug("Creating allocation ratios for %s", n)
        fba_allocation_subset_2 = get_fba_allocation_subset(
            fba_allocation_subset, k, [n],
            flowSubsetMapped=flow_subset_mapped,
            allocMethod=attr['allocation_method'],
            activity_set_names=aset_names)
        if len(fba_allocation_subset_2) == 0:
            log.info("No data found to allocate %s", n)
        else:
            flow_alloc = allocate_by_sector(
                fba_allocation_subset_2, attr['allocation_method'],
                group_cols, flowSubsetMapped=flow_subset_mapped)
            flow_alloc = flow_alloc.assign(FBA_Activity=n)
            flow_alloc_list.append(flow_alloc)
    flow_allocation = pd.concat(flow_alloc_list, ignore_index=True)

    # generalize activity field names to enable link to main fba source
    log.info("Generalizing activity columns in subset of %s",
             attr['allocation_source'])
    flow_allocation = collapse_activity_fields(flow_allocation)

    # check for issues with allocation ratios
    check_allocation_ratios(flow_allocation, aset, method)

    # create list of sectors in the flow allocation df, drop any rows of
    # data in the flow df that aren't in list
    sector_list = flow_allocation['Sector'].unique().tolist()

    # subset fba allocation table to the values in the activity list,
    # based on overlapping sectors
    flow_subset_mapped = flow_subset_mapped.loc[
        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

    # check if fba and allocation dfs have the same LocationSystem
    log.info("Checking if flowbyactivity and allocation dataframes "
             "use the same location systems")
    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

    # merge fba df w/flow allocation dataset, once per activity
    # direction; the two merges produce _x/_y suffixed columns that
    # are coalesced below
    log.info("Merge %s and subset of %s", k, attr['allocation_source'])
    for j in activity_fields.values():
        # check units
        compare_df_units(flow_subset_mapped, flow_allocation)
        flow_subset_mapped = flow_subset_mapped.merge(
            flow_allocation[['Location', 'Sector',
                             'FlowAmountRatio', 'FBA_Activity']],
            left_on=['Location', j[1]["flowbysector"],
                     j[0]["flowbyactivity"]],
            right_on=['Location', 'Sector', 'FBA_Activity'],
            how='left')

    # merge the flowamount columns
    flow_subset_mapped.loc[:, 'FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio_x'].fillna(
            flow_subset_mapped['FlowAmountRatio_y'])
    # fill null rows with 0 because no allocation info
    flow_subset_mapped['FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio'].fillna(0)
    # drop rows where there is no allocation data; drop=True prevents
    # the old index from being added as a stray 'index' column in the
    # returned df
    fbs = flow_subset_mapped.dropna(subset=['Sector_x', 'Sector_y'],
                                    how='all').reset_index(drop=True)

    # calculate flow amounts for each sector
    log.info("Calculating new flow amounts using flow ratios")
    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

    # drop columns
    log.info("Cleaning up new flow by sector")
    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                            'FlowAmountRatio_y', 'FlowAmountRatio',
                            'FBA_Activity_x', 'FBA_Activity_y'])

    return fbs