def proportional_allocation_by_location_and_sector(df, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated sector within a location
    :param df: df, FBA with sector columns
    :param sectorcolumn: str, sector column on which to base the allocation
    :return: df with a 'FlowAmountRatio' column
    """
    from flowsa.common import load_source_catalog

    cat = load_source_catalog()
    src_info = cat[pd.unique(df['SourceName'])[0]]
    # load source catalog to determine the level of sector aggregation
    # associated with a crosswalk
    level_of_aggregation = src_info['sector_aggregation_level']

    # denominator summed from highest level of sector grouped by location
    short_length = min(df[sectorcolumn].apply(lambda x: len(str(x))).unique())
    # want to create denominator based on short_length - 1, unless short_length = 2
    denom_df = df.loc[df[sectorcolumn].apply(lambda x: len(x) == short_length)]
    if (level_of_aggregation == 'disaggregated') & (short_length != 2):
        short_length = short_length - 1
        denom_df.loc[:, 'sec_tmp'] = \
            denom_df[sectorcolumn].apply(lambda x: x[0:short_length])
        denom_df.loc[:, 'Denominator'] = \
            denom_df.groupby(['Location', 'sec_tmp'])['FlowAmount'].transform('sum')
    else:  # short_length == 2
        denom_df.loc[:, 'Denominator'] = denom_df['FlowAmount']
        denom_df.loc[:, 'sec_tmp'] = denom_df[sectorcolumn]

    denom_df_2 = denom_df[['Location', 'LocationSystem', 'Year',
                           'sec_tmp', 'Denominator']].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    df.loc[:, 'sec_tmp'] = df[sectorcolumn].apply(lambda x: x[0:short_length])
    allocation_df = df.merge(denom_df_2, how='left',
                             on=['Location', 'LocationSystem', 'Year', 'sec_tmp'])
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(
        columns=['Denominator', 'sec_tmp']).reset_index(drop=True)

    return allocation_df
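# --- Illustrative sketch (not part of the flowsa source) --------------------
# A minimal, self-contained demo of the proportional-allocation arithmetic
# used above: within each Location, the denominator is the FlowAmount summed
# over the truncated (aggregated) sector, and each row's ratio is its
# FlowAmount divided by that denominator. Toy data and column names here are
# assumptions for illustration only.
def _demo_proportional_allocation():
    import pandas as pd

    df = pd.DataFrame({
        'Location': ['US', 'US', 'US'],
        'Sector': ['1111', '1112', '1121'],
        'FlowAmount': [30.0, 70.0, 50.0],
    })
    # truncate sectors by one digit, mirroring short_length - 1
    df['sec_tmp'] = df['Sector'].str[:3]
    # denominator: FlowAmount summed within each Location/truncated-sector group
    df['Denominator'] = \
        df.groupby(['Location', 'sec_tmp'])['FlowAmount'].transform('sum')
    df['FlowAmountRatio'] = df['FlowAmount'] / df['Denominator']
    # '1111' -> 0.3, '1112' -> 0.7, '1121' -> 1.0
    return df.drop(columns=['sec_tmp', 'Denominator'])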
def direct_allocation_method(flow_subset_mapped, k, names, method):
    """
    Directly assign activities to sectors
    :param flow_subset_mapped: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns
    """
    log.info('Directly assigning activities to sectors')
    fbs = flow_subset_mapped.copy()
    # for each activity, if activities are not sector-like,
    # check that there is no data loss
    if load_source_catalog()[k]['sector-like_activities'] is False:
        activity_list = []
        for n in names:
            log.debug('Checking for %s at %s', n, method['target_sector_level'])
            fbs_subset = fbs[((fbs[fba_activity_fields[0]] == n) &
                              (fbs[fba_activity_fields[1]] == n)) |
                             (fbs[fba_activity_fields[0]] == n) |
                             (fbs[fba_activity_fields[1]] == n)].reset_index(drop=True)
            fbs_subset = allocate_dropped_sector_data(
                fbs_subset, method['target_sector_level'])
            activity_list.append(fbs_subset)
        fbs = pd.concat(activity_list, ignore_index=True)
    return fbs
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name):
    """
    Add sectors from the Activity fields, mapped to Sector via the crosswalk.
    No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :return: a df with activity fields mapped to 'sectors'
    """
    mappings = []

    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    cat = load_source_catalog()

    for s in pd.unique(flowbyactivity_df['SourceName']):
        src_info = cat[s]
        # read the pre-determined level of sector aggregation of
        # each crosswalk from the source catalog
        levelofSectoragg = src_info['sector_aggregation_level']
        # if data are provided in NAICS format, use the mastercrosswalk
        if src_info['sector-like_activities']:
            cw = load_sector_crosswalk()
            sectors = cw.loc[:, [sector_source_name]]
            # Create mapping df that's just the sectors at first
            mapping = sectors.drop_duplicates()
            # Add the sector twice as activities so mapping is identical
            mapping = mapping.assign(Activity=sectors[sector_source_name])
            mapping = mapping.rename(columns={sector_source_name: "Sector"})
            # add columns so can run expand_naics_list_fxn
            # if sector-like_activities = True, missing columns, so add
            mapping['ActivitySourceName'] = s
            # tmp assignment
            mapping['SectorType'] = None
            # Include all digits of naics in mapping,
            # if levelofNAICSagg is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        else:
            # if source data activities are text strings,
            # call on the manually created source crosswalks
            mapping = get_activitytosector_mapping(s)
            # filter by SectorSourceName of interest
            mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
            # drop SectorSourceName
            mapping = mapping.drop(columns=['SectorSourceName'])
            # Include all digits of naics in mapping,
            # if levelofNAICSagg is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        mappings.append(mapping)
    mappings_df = pd.concat(mappings, sort=False)
    # Merge in with flowbyactivity by each activity direction
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mappings_df.rename(
            columns={'Activity': flowbyactivity_field,
                     'Sector': flowbysector_field,
                     'SectorType': sector_type_field})
        # column doesn't exist for sector-like activities,
        # so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp, how='left',
                                             on=flowbyactivity_field)
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace({np.nan: None})

    return flowbyactivity_wsector_df
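# --- Illustrative sketch (not part of the flowsa source) --------------------
# A minimal demo of the mapping step above: an activity-to-sector crosswalk is
# renamed to the producing/consuming field names, then left-merged onto the
# FlowByActivity table so unmapped rows are preserved. Toy data and column
# values are assumptions for illustration only.
def _demo_add_sectors():
    import pandas as pd

    fba = pd.DataFrame({
        'ActivityProducedBy': ['Irrigation', 'Livestock'],
        'FlowAmount': [10.0, 5.0],
    })
    crosswalk = pd.DataFrame({
        'Activity': ['Irrigation'],
        'Sector': ['111'],
    })
    mapping = crosswalk.rename(columns={'Activity': 'ActivityProducedBy',
                                        'Sector': 'SectorProducedBy'})
    # left merge keeps the unmapped 'Livestock' row (SectorProducedBy = NaN)
    return fba.merge(mapping, how='left', on='ActivityProducedBy')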
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subset; an example of a special case is when the
    # allocation method is 'proportional-flagged'
    # default to False so the flag is always defined, even when kwargs is empty
    special_case = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                special_case = True

    # load the source catalog
    cat = load_source_catalog()
    src_info = cat[source]

    if src_info['sector-like_activities'] is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        # take the single sector source name listed in the crosswalk
        sector_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sector_source_name)
        # subset source crosswalk to only contain values
        # pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in the activity list,
        # based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(sector_list)) |
                (fba_allocation[fbs_activity_fields[1]].isin(sector_list))
                ].reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(activitynames)].reset_index(drop=True)
        elif special_case:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))
                ].reset_index(drop=True)
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(modified_activitynames))
                ].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(activitynames))
                ].reset_index(drop=True)

    return fba_allocation_subset
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name, **kwargs):
    """
    Add sectors from the Activity fields, mapped to Sector via the crosswalk.
    No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :param kwargs: option to include the parameter 'allocationmethod',
        which modifies function behavior if = 'direct'
    :return: a df with activity fields mapped to 'sectors'
    """
    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    cat = load_source_catalog()

    s = pd.unique(flowbyactivity_df['SourceName'])[0]
    # load catalog info for source
    src_info = cat[s]
    # if activities are sector-like, check if need to modify mapping
    if 'modify_sector-like_activities' in src_info:
        modify_sector_like_activities = src_info['modify_sector-like_activities']
    else:
        modify_sector_like_activities = False
    # read the pre-determined level of sector aggregation of
    # each crosswalk from the source catalog
    levelofSectoragg = src_info['sector_aggregation_level']
    # if the FBS activity set is 'direct', overwrite the levelofsectoragg,
    # or if specified in fxn call
    if kwargs != {}:
        if 'allocationmethod' in kwargs:
            if kwargs['allocationmethod'] == 'direct':
                levelofSectoragg = 'disaggregated'
        if 'overwrite_sectorlevel' in kwargs:
            levelofSectoragg = kwargs['overwrite_sectorlevel']
    # if data are provided in NAICS format, use the mastercrosswalk
    if src_info['sector-like_activities'] and \
            modify_sector_like_activities is False:
        cw = load_sector_crosswalk()
        sectors = cw.loc[:, [sector_source_name]]
        # Create mapping df that's just the sectors at first
        mapping = sectors.drop_duplicates()
        # Add the sector twice as activities so mapping is identical
        mapping = mapping.assign(Activity=sectors[sector_source_name])
        mapping = mapping.rename(columns={sector_source_name: "Sector"})
        # add columns so can run expand_naics_list_fxn
        # if sector-like_activities = True, missing columns, so add
        mapping['ActivitySourceName'] = s
        # tmp assignment
        mapping['SectorType'] = None
        # Include all digits of naics in mapping,
        # if levelofNAICSagg is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    else:
        # if source data activities are text strings, or sector-like activities
        # should be modified, call on the manually created source crosswalks
        mapping = get_activitytosector_mapping(s)
        # filter by SectorSourceName of interest
        mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
        # drop SectorSourceName
        mapping = mapping.drop(columns=['SectorSourceName'])
        # Include all digits of naics in mapping,
        # if levelofNAICSagg is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    # Merge in with flowbyactivity by each activity direction
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mapping.rename(
            columns={'Activity': flowbyactivity_field,
                     'Sector': flowbysector_field,
                     'SectorType': sector_type_field})
        # column doesn't exist for sector-like activities,
        # so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp, how='left',
                                             on=flowbyactivity_field)
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace({np.nan: None})
    # add sector source name
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.assign(
        SectorSourceName=sectorsourcename)

    # if activities are sector-like, check that the sectors are in the crosswalk
    if src_info['sector-like_activities']:
        flowbyactivity_wsector_df = replace_naics_w_naics_2012(
            flowbyactivity_wsector_df, sectorsourcename)

    return flowbyactivity_wsector_df
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']
                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of
                # specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(
                        sys.modules[__name__],
                        v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr=attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k
                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped,
                                                            attr, method)

                # if allocation method is "direct", then no need to create
                # alloc ratios, else need to use allocation dataframe to
                # create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector-like,
                    # check that there is no data loss
                    if load_source_catalog()[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            fbs_subset = fbs[
                                ((fbs[fba_activity_fields[0]] == n) &
                                 (fbs[fba_activity_fields[1]] == n)) |
                                (fbs[fba_activity_fields[0]] == n) |
                                (fbs[fba_activity_fields[1]] == n)
                                ].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature of the allocation,
                # call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info('Calling on function specified in method yaml '
                             'to allocate ' + ', '.join(map(str, names)) +
                             ' to sectors')
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                        flow_subset_mapped, attr, fbs_list)
                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']]
                        ).reset_index(drop=True)

                    # clean df and harmonize units
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the
                    # flowbyactivity source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba"])(fba_allocation,
                                                          attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])
                    # call on fxn to further clean up/disaggregate the fba
                    # allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                            fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors
                    # associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec, k, names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for "
                                 "subset of " + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    flow_alloc_list = []
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [e for e in group_cols
                                  if e not in ('ActivityProducedBy',
                                               'ActivityConsumedBy')]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset, k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2, k,
                                attr['allocation_source'],
                                attr['allocation_method'], group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable
                    # link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k, method_name)

                    # create list of sectors in the flow allocation df, drop any
                    # rows of data in the flow df that aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity
                    # list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation "
                             "dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[['Location', 'Sector',
                                         'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorProducedBy',
                                 'ActivityProducedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')
                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector',
                                         'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorConsumedBy',
                                 'ActivityConsumedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = \
                        fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales -
                    # comment back in after addressing the 'todo'
                    # log.info("Checking if flowbyactivity and allocation
                    # dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = \
                        fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x',
                                            'Sector_y', 'FlowAmountRatio_y',
                                            'FlowAmountRatio',
                                            'FBA_Activity_x', 'FBA_Activity_y'])

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']
                to_scale = method['target_geoscale']
                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the
                # target sector length; if so, allocate values equally to
                # disaggregated sectors
                log.info('Checking for data at ' + method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like
                # in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))
                    ].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())
                    ].reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))
                    ].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k,
                    method_name, attr, method, mapping_files)

                log.info("Completed flowbysector for activity subset "
                         "with flows " + ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same
    # specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(['SectorProducedBy', 'SectorConsumedBy',
                             'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                                 source_name, source_attr,
                                                 activity_attr, method):
    """
    Function to compare the loaded flowbyactivity total, after subsetting by
    activity and geography, with the final flowbysector output total at the
    target sector level. Not a direct comparison of the loaded FBA because
    FBAs are modified before being subset by activity.
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml for source data
    :param activity_attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output totals
        by location, save results as csv in local directory
    """
    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, activity_attr['allocation_from_scale'],
                                method['target_geoscale'])
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[['Class', 'FlowAmount', 'Unit', 'Context',
                   'ActivityProducedBy', 'ActivityConsumedBy',
                   'Location', 'LocationSystem']]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(columns={'ActivityProducedBy': 'SectorProducedBy',
                                  'ActivityConsumedBy': 'SectorConsumedBy'})
        group_cols_agg = ['Class', 'Context', 'Unit', 'Location',
                          'LocationSystem', 'SectorProducedBy',
                          'SectorConsumedBy']
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = ['Class', 'FlowAmount', 'Unit', 'Context',
                  'Location', 'LocationSystem']
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount',
                            'Unit': 'FBA_unit'}, inplace=True)
    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount',
                            'Unit': 'FBS_unit'}, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference'] / df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[['Class', 'Context', 'Location', 'LocationSystem',
                             'FBA_amount', 'FBA_unit', 'FBS_amount', 'FBS_unit',
                             'FlowAmount_difference', 'Percent_difference']]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info('Comparing FBA %s %s subset to FBS results. '
                  'Details in Validation Log', activity_set,
                  source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = df_merge[(df_merge['Context'] == i) &
                                       (df_merge['Location'] == j)
                                       ].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info('FlowBySector FlowAmount for %s %s %s '
                          'does not exist in the FBS',
                          source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            if diff_per > 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                          'less than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                          'more than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info('FlowBySector FlowAmount for %s %s %s at %s '
                                  'is equal to the FlowByActivity FlowAmount',
                                  source_name, activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info('Save the comparison of FlowByActivity load to FlowBySector '
                 'total FlowAmounts for %s in validation log file',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info('Comparison of FBA load to FBS total '
                              'FlowAmounts for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')
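# --- Illustrative sketch (not part of the flowsa source) --------------------
# A minimal demo of the validation arithmetic above: aggregate both dataframes
# over shared columns, merge, and express the FBA-FBS difference as a
# percentage of the FBA total. Toy data and column values are assumptions.
def _demo_compare_totals():
    import pandas as pd

    fba_agg = pd.DataFrame({'Location': ['US'], 'FBA_amount': [100.0]})
    fbs_agg = pd.DataFrame({'Location': ['US'], 'FBS_amount': [98.0]})
    df = fba_agg.merge(fbs_agg, how='left', on='Location')
    df['FlowAmount_difference'] = df['FBA_amount'] - df['FBS_amount']
    df['Percent_difference'] = \
        df['FlowAmount_difference'] / df['FBA_amount'] * 100
    # Percent_difference == 2.0, i.e. the FBS total is 2% less than the FBA
    return df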
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level output,
    checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
        save results as csv in local directory
    """
    if load_source_catalog()[source_name]['sector-like_activities']:
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector ratios '
                   'for a dataset with sector-like activities because if there '
                   'are modifications to flowamounts for a sector, then the '
                   'ratios will be different')
    else:
        # subset fba df
        fba = fba_load[['Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
                        'ActivityProducedBy', 'ActivityConsumedBy', 'Context',
                        'Location', 'LocationSystem', 'Year', 'FlowAmount'
                        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                      'Unit', 'FlowType', 'Context', 'Location',
                      'LocationSystem', 'Year']
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df
        fbs = fbs_load[['Class', 'SectorSourceName', 'Flowable', 'Unit',
                        'FlowType', 'SectorProducedBy', 'SectorConsumedBy',
                        'ActivityProducedBy', 'ActivityConsumedBy', 'Context',
                        'Location', 'LocationSystem', 'Year', 'FlowAmount'
                        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                      'Unit', 'FlowType', 'Context', 'Location',
                      'LocationSystem', 'Year', 'SectorLength']
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge the fba and fbs aggregated totals
        merge_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                      'Unit', 'FlowType', 'Context', 'Location',
                      'LocationSystem', 'Year']
        df_merge = fba_agg.merge(fbs_agg, on=merge_cols, how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[['ActivityProducedBy', 'ActivityConsumedBy',
                             'Flowable', 'Unit', 'FlowType', 'Context',
                             'Location', 'LocationSystem', 'Year',
                             'SectorLength', 'FBA_amount', 'FBS_amount',
                             'Ratio']]

        # keep only rows of specified sector length
        comparison = df_merge[
            df_merge['SectorLength'] == sector_level_key[
                config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info('There are %s combinations of flowable/context/sector '
                      'length where the flowbyactivity to flowbysector ratio '
                      'is less than or greater than 1 by %s',
                      len(comparison2), str(tolerance))

        # include df subset in the validation log
        # only print rows where flowamount ratio is less than 1
        # (round flowamountratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info('Save the comparison of FlowByActivity load to FlowBySector '
                 'ratios for %s in validation log', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info('Comparison of FlowByActivity load to '
                              'FlowBySector ratios for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Function to compare the loaded flowbyactivity total with the final
    flowbysector output total
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param method_name: str, name of the FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param mapping_files: mapping file(s) for elementary flows
    :return: printout data differences between loaded FBA and FBS output
        totals, save results as csv in local directory
    """
    from flowsa.flowbyfunctions import subset_df_by_geoscale, sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info('Comparing loaded FlowByActivity FlowAmount total to '
             'subset FlowBySector FlowAmount total')

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[['Class', 'FlowAmount', 'Unit', 'Context',
                   'ActivityProducedBy', 'ActivityConsumedBy',
                   'Location', 'LocationSystem']]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(columns={'ActivityProducedBy': 'SectorProducedBy',
                                  'ActivityConsumedBy': 'SectorConsumedBy'})
        group_cols_agg = ['Class', 'Context', 'Unit', 'Location',
                          'LocationSystem', 'SectorProducedBy',
                          'SectorConsumedBy']
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = ['Class', 'FlowAmount', 'Unit', 'Context',
                  'Location', 'LocationSystem']
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount',
                            'Unit': 'FBA_unit'}, inplace=True)
    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount',
                            'Unit': 'FBS_unit'}, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference'] / df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[['Class', 'Context', 'Location', 'LocationSystem',
                             'FBA_amount', 'FBA_unit', 'FBS_amount', 'FBS_unit',
                             'FlowAmount_difference', 'Percent_difference']]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts
        context_list = df_merge['Context'].to_list()

        # loop through the contexts and print results of comparison
        for i in context_list:
            df_merge_subset = \
                df_merge[df_merge['Context'] == i].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)
            diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% less than the total FlowByActivity FlowAmount')
            else:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% more than the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info('Save the comparison of FlowByActivity load to FlowBySector '
                 'total FlowAmounts for ' + activity_set + ' in output folder')
        # output data at all sector lengths
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv", index=False)
    except Exception:
        log.info('Error occurred when comparing total FlowAmounts '
                 'for FlowByActivity and FlowBySector')

    return None
def estimate_suppressed_data(df, sector_column, naics_level, sourcename):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param naics_level: numeric, indicates at what NAICS length to base
        estimated suppressed data off (2 - 5)
    :param sourcename: str, sourcename
    :return: df, with estimated suppressed data
    """
    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df)

    # find the longest length sector
    max_length = max(df[sector_column].apply(lambda x: len(str(x))).unique())
    # loop through starting at naics_level,
    # use most detailed level possible to save time
    for i in range(naics_level, max_length):
        # create df of i length
        df_x = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # create df of i + 1 length
        df_y = df.loc[df[sector_column].apply(lambda x: len(x) == i + 1)]
        # create temp sector columns in df y, that are i digits in length
        df_y = df_y.assign(s_tmp=df_y[sector_column].apply(lambda x: x[0:i]))
        # create list of location and temp activity combos that contain a 0
        missing_sectors_df = df_y[df_y['FlowAmount'] == 0]
        missing_sectors_list = missing_sectors_df[
            ['Location', 's_tmp']].drop_duplicates().values.tolist()
        # subset the y df
        if len(missing_sectors_list) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows, set conditions
            suppressed_list = []
            for q, r in missing_sectors_list:
                c1 = df_y['Location'] == q
                c2 = df_y['s_tmp'] == r
                # subset data
                suppressed_list.append(df_y.loc[c1 & c2])
            suppressed_sectors = pd.concat(suppressed_list, sort=False,
                                           ignore_index=True)
            # add column of existing allocated data for length of i
            suppressed_sectors['alloc_flow'] = suppressed_sectors.groupby(
                ['Location', 's_tmp'])['FlowAmount'].transform('sum')
            # subset further so only keep rows of 0 value
            suppressed_sectors_sub = \
                suppressed_sectors[suppressed_sectors['FlowAmount'] == 0]
            # add count
            suppressed_sectors_sub = suppressed_sectors_sub.assign(
                sector_count=suppressed_sectors_sub.groupby(
                    ['Location', 's_tmp'])['s_tmp'].transform('count'))

            # merge suppressed sector subset with df x
            df_m = pd.merge(
                df_x,
                suppressed_sectors_sub[['Class', 'Compartment', 'FlowType',
                                        'FlowName', 'Location',
                                        'LocationSystem', 'Unit', 'Year',
                                        sector_column, 's_tmp', 'alloc_flow',
                                        'sector_count']],
                left_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                         'Location', 'LocationSystem', 'Unit', 'Year',
                         sector_column],
                right_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                          'Location', 'LocationSystem', 'Unit', 'Year',
                          's_tmp'],
                how='right')
            # drop any rows where flowamount is none
            df_m = df_m[~df_m['FlowAmount'].isna()]
            # calculate estimated flows by subtracting the flow amount already
            # allocated from the total flow of the sector one level up and
            # dividing by the number of sectors with suppressed data
            df_m.loc[:, 'FlowAmount'] = \
                (df_m['FlowAmount'] - df_m['alloc_flow']) / df_m['sector_count']
            # only keep the suppressed sector subset activity columns
            df_m = df_m.drop(columns=[sector_column + '_x', 's_tmp',
                                      'alloc_flow', 'sector_count'])
            df_m = df_m.rename(columns={sector_column + '_y': sector_column})
            # reset activity columns
            if load_source_catalog()[sourcename]['sector-like_activities']:
                df_m = df_m.assign(ActivityProducedBy=df_m['SectorProducedBy'])
                df_m = df_m.assign(ActivityConsumedBy=df_m['SectorConsumedBy'])

            # drop the existing rows with suppressed data and append the
            # new estimates from the fba df
            modified_df = pd.merge(
                df, df_m[['FlowName', 'Location', sector_column]],
                indicator=True,
                how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
            df = pd.concat([modified_df, df_m], ignore_index=True)

    df_w_estimated_data = replace_strings_with_NoneType(df)

    return df_w_estimated_data
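# --- Illustrative sketch (not part of the flowsa source) --------------------
# A minimal demo of the estimation rule above: the flow already allocated to a
# parent's children is subtracted from the parent total, and the remainder is
# split equally across the children reported as zero (suppressed). Toy NAICS
# codes and values are assumptions for illustration only.
def _demo_estimate_suppressed():
    parent_total = 100.0          # flow for 4-digit parent NAICS '1111'
    child_flows = {'11111': 40.0, '11112': 0.0, '11113': 0.0}  # 0 = suppressed
    allocated = sum(child_flows.values())                      # 40.0
    suppressed = [k for k, v in child_flows.items() if v == 0.0]
    # remainder of the parent total, split equally among suppressed children
    estimate = (parent_total - allocated) / len(suppressed)    # 30.0 each
    for k in suppressed:
        child_flows[k] = estimate
    return child_flows  # {'11111': 40.0, '11112': 30.0, '11113': 30.0}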
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # load source catalog
        cat = load_source_catalog()
        s = pd.unique(df_load['SourceName'])[0]
        # load catalog info for source
        src_info = cat[s]
        sector_like_activities = src_info['sector-like_activities']

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # if activities are sector-like, drop from df and group cols,
    # add back in as copies of sector columns at the end
    if sector_like_activities:
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order, from the longest length naics minus 1 to 2;
    # appends missing naics levels to df
    for i in range(length - 1, 1, -1):
        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create a list of i digit sectors in df subset
        sector_subset = df_subset[
            ['Location', fbs_activity_fields[0], fbs_activity_fields[1]]
            ].drop_duplicates().reset_index(drop=True)
        df_sectors = sector_subset.copy()
        df_sectors.loc[:, 'SectorProducedBy'] = \
            df_sectors['SectorProducedBy'].apply(lambda x: x[0:i])
        df_sectors.loc[:, 'SectorConsumedBy'] = \
            df_sectors['SectorConsumedBy'].apply(lambda x: x[0:i])
        sector_list = df_sectors.drop_duplicates().values.tolist()
        # create a list of sectors that are exactly i digits long
        # where either sector column is i digits in length
        df_existing_1 = sector_subset.loc[
            (sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) |
            (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # where both sector columns are i digits in length
        df_existing_2 = sector_subset.loc[
            (sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) &
            (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # concat existing dfs
        df_existing = pd.concat([df_existing_1, df_existing_2], sort=False)
        existing_sectors = df_existing.drop_duplicates().dropna().values.tolist()
        # list of sectors of length i that are not in sector list
        missing_sectors = [e for e in sector_list if e not in existing_sectors]
        if len(missing_sectors) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows, set conditions
            agg_sectors_list = []
            for q, r, s in missing_sectors:
                c1 = df_subset['Location'] == q
                c2 = df_subset[fbs_activity_fields[0]].apply(
                    lambda x: x[0:i] == r)
                c3 = df_subset[fbs_activity_fields[1]].apply(
                    lambda x: x[0:i] == s)
                # subset data
                agg_sectors_list.append(df_subset.loc[c1 & c2 & c3])
            agg_sectors = pd.concat(agg_sectors_list, sort=False)
            agg_sectors = agg_sectors.loc[
                (agg_sectors[fbs_activity_fields[0]].apply(
                    lambda x: len(x) > i)) |
                (agg_sectors[fbs_activity_fields[1]].apply(
                    lambda x: len(x) > i))]
            agg_sectors.loc[:, fbs_activity_fields[0]] = \
                agg_sectors[fbs_activity_fields[0]].apply(lambda x: x[0:i])
            agg_sectors.loc[:, fbs_activity_fields[1]] = \
                agg_sectors[fbs_activity_fields[1]].apply(lambda x: x[0:i])
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(agg_sectors, group_cols)
            # append to df (df.append is deprecated in newer pandas,
            # so use pd.concat, which is behavior-equivalent here)
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)

    # manually modify non-NAICS codes that might exist in sector columns
    df.loc[:, 'SectorConsumedBy'] = np.where(
        df['SectorConsumedBy'].isin(['F0', 'F01']),
        'F010', df['SectorConsumedBy'])  # domestic/household
    df.loc[:, 'SectorProducedBy'] = np.where(
        df['SectorProducedBy'].isin(['F0', 'F01']),
        'F010', df['SectorProducedBy'])  # domestic/household
    # drop any duplicates created by modifying sector codes
    df = df.drop_duplicates()
    # if activities are sector-like,
    # set col values as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df)

    return df
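# --- Illustrative sketch (not part of the flowsa source) --------------------
# A minimal demo of the roll-up above: when a sector length is missing,
# less-aggregated sectors are truncated and their flows summed to create the
# missing level. Toy data and column values are assumptions for illustration.
def _demo_sector_aggregation():
    import pandas as pd

    df = pd.DataFrame({
        'Location': ['US', 'US'],
        'SectorProducedBy': ['11111', '11112'],
        'FlowAmount': [40.0, 60.0],
    })
    # create the missing 4-digit level by truncating and summing
    agg = (df.assign(SectorProducedBy=df['SectorProducedBy'].str[:4])
             .groupby(['Location', 'SectorProducedBy'],
                      as_index=False)['FlowAmount']
             .sum())
    # appends the new row ('US', '1111', 100.0) to the original df
    return pd.concat([df, agg], ignore_index=True)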
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments, only argument is "method_name",
        the name of method corresponding to flowbysector method yaml name
    :return: parquet, FBS save to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = map_fbs_flows(flows, k, v,
                                                        keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)
            else:
                aset_names = None

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']
                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names))
                    ].reset_index(drop=True)

                # if activities are sector-like, check sectors are valid
                if load_source_catalog()[k]['sector-like_activities']:
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])
                    # check impact on df FlowAmounts
                    vLog.info('Calculate FlowAmount difference caused by '
                              'replacing NAICS Codes with %s, saving '
                              'difference in Validation log',
                              method['target_sector_source'])
                    calculate_flowamount_diff_between_dfs(flows_subset,
                                                          flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level of
                # specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need to create
                # alloc ratios, else need to use allocation dataframe to
                # create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k,
                                                   names, method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature of the allocation,
                # call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v, aset,
                                                    method_name, aset_names)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']
                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching the
                # target sector length; if so, allocate values equally to
                # disaggregated sectors
                vLog.info('Searching for and allocating FlowAmounts for any '
                          'parent NAICS that were dropped in the subset to '
                          '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = allocate_dropped_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(
                    flows_mapped_wsec, fbs_agg_2, aset, k, method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))
                    ].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())
                    ].reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))
                    ].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(['SectorProducedBy', 'SectorConsumedBy',
                             'Flowable', 'Context']).reset_index(drop=True)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info('See the Validation log for detailed assessment of '
             'model results in %s', logoutputpath)
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subset; an example of a special case is when the
    # allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    # load the source catalog
    cat = load_source_catalog()
    src_info = cat[source]

    if src_info['sector-like_activities'] is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values
        # pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(sector_list)
                ].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(sector_list)) |
                (fba_allocation[fbs_activity_fields[1]].isin(sector_list))
                ].reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(activitynames)
                ].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))
                ].reset_index(drop=True)
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(
                    modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(
                    modified_activitynames))
                ].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(activitynames))
                ].reset_index(drop=True)

    # if activity set names are included in the function call and are not
    # null, then subset data based on the value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = asn[asn['name'].isin(activitynames)].reset_index(drop=True)
        if asn_subset['allocation_subset'].isna().all():
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error('Define column and value to subset on in the '
                      'activity set csv for all rows')
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] == val_to_subset
                ].reset_index(drop=True)

    return fba_allocation_subset
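# --- Illustrative sketch (not part of the flowsa source) --------------------
# A minimal demo of the subset logic above: crosswalk rows for the requested
# activities supply a sector list, and the allocation table is kept wherever
# a sector column matches that list. Toy data and values are assumptions.
def _demo_allocation_subset():
    import pandas as pd

    crosswalk = pd.DataFrame({'Activity': ['Irrigation', 'Mining'],
                              'Sector': ['111', '212']})
    alloc = pd.DataFrame({'SectorProducedBy': ['111', '212', '311'],
                          'FlowAmount': [1.0, 2.0, 3.0]})
    # sectors associated with the activity of interest
    sector_list = crosswalk.loc[crosswalk['Activity'] == 'Irrigation',
                                'Sector'].tolist()
    # keep only allocation rows whose sector overlaps that list
    return alloc[alloc['SectorProducedBy'].isin(sector_list)
                 ].reset_index(drop=True)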