def direct_allocation_method(fbs, k, names, method):
    """
    Directly assign activities to sectors
    :param fbs: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns
    """
    log.info('Directly assigning activities to sectors')
    # for each activity, if activities are not sector-like,
    # check that there is no data loss
    if check_activities_sector_like(k) is False:
        activity_list = []
        n_allocated = []
        for n in names:
            # avoid double counting by dropping n from the df after calling on
            # n, in the event both ACB and APB values exist
            fbs = fbs[~(
                (fbs[fba_activity_fields[0]].isin(n_allocated)) |
                (fbs[fba_activity_fields[1]].isin(n_allocated)))].reset_index(
                drop=True)
            log.debug('Checking for %s at %s', n,
                      method['target_sector_level'])
            fbs_subset = \
                fbs[(fbs[fba_activity_fields[0]] == n) |
                    (fbs[fba_activity_fields[1]] == n)].reset_index(drop=True)
            # check if an Activity maps to more than one sector,
            # if so, equally allocate
            fbs_subset = equal_allocation(fbs_subset)
            fbs_subset = equally_allocate_parent_to_child_naics(
                fbs_subset, method['target_sector_level'])
            activity_list.append(fbs_subset)
            n_allocated.append(n)
        fbs = pd.concat(activity_list, ignore_index=True)
    return fbs

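
# Illustrative sketch (not part of the flowsa API): a minimal, standalone
# example of the equal-allocation idea used above, written with plain pandas.
# The column names mirror the FBS format; the helper name
# `_example_equal_allocation` and the sample activity/sector values are
# hypothetical.
def _example_equal_allocation():
    import pandas as pd
    # one activity ("Irrigation") maps to two candidate sectors, so the
    # activity total is repeated once per sector match
    df = pd.DataFrame({
        'ActivityConsumedBy': ['Irrigation', 'Irrigation'],
        'SectorConsumedBy': ['1111', '1112'],
        'FlowAmount': [100.0, 100.0],
    })
    # split the activity total evenly across the sectors it maps to
    df['FlowAmount'] = df['FlowAmount'] / df.groupby(
        'ActivityConsumedBy')['SectorConsumedBy'].transform('count')
    return df  # each sector now carries 50.0
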
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after subsetting by
    activity and geography with the final flowbysector output total. Not a
    direct comparison of the loaded FBA because FBAs are modified before
    being subset by activity for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """
    vLog.info('Comparing Flow-By-Activity subset by activity and geography '
              'to the subset Flow-By-Sector FlowAmount total.')

    # determine the "from" scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    }, inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    }, inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference'] / df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info('Comparing FBA %s %s subset to FBS results. '
                  'Details in Validation Log', activity_set,
                  source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info('FlowBySector FlowAmount for %s %s %s '
                          'does not exist in the FBS', source_name,
                          activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is '
                          '%s%% less than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is '
                          '%s%% more than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info('FlowBySector FlowAmount for '
                                  '%s %s %s at %s is equal to the '
                                  'FlowByActivity FlowAmount',
                                  source_name, activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info('Save the comparison of FlowByActivity load to FlowBySector '
                 'total FlowAmounts for %s in validation log file',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info('Comparison of FBA load to FBS total '
                              'FlowAmounts for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')

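
# Illustrative sketch (hypothetical helper, not the flowsa API): how the
# FBA-vs-FBS percent difference computed above behaves on toy totals; the
# sample numbers are made up for illustration.
def _example_percent_difference():
    import pandas as pd
    totals = pd.DataFrame({'FBA_amount': [200.0, 50.0],
                           'FBS_amount': [190.0, 50.0]})
    # positive difference means the FBS total is smaller than the FBA total
    totals['FlowAmount_difference'] = (
        totals['FBA_amount'] - totals['FBS_amount'])
    totals['Percent_difference'] = (
        totals['FlowAmount_difference'] / totals['FBA_amount']) * 100
    return totals  # first row: 5% (FBS lower than FBA); second row: 0%
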
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level output,
    checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
        save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df
        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge the aggregated FBA and FBS dfs
        df_merge = fba_agg.merge(
            fbs_agg,
            left_on=[
                'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                'Unit', 'FlowType', 'Context', 'Location', 'LocationSystem',
                'Year'
            ],
            right_on=[
                'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                'Unit', 'FlowType', 'Context', 'Location', 'LocationSystem',
                'Year'
            ],
            how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info('There are %s combinations of flowable/context/sector '
                      'length where the flowbyactivity to flowbysector ratio '
                      'is less than or greater than 1 by %s',
                      len(comparison2), str(tolerance))

        # include df subset in the validation log; only print rows where the
        # flowamount ratio is less than 1 (round the flowamount ratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info('Save the comparison of FlowByActivity load '
                 'to FlowBySector ratios for %s in validation log',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info('Comparison of FlowByActivity load to '
                              'FlowBySector ratios for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)

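
# Illustrative sketch (hypothetical helper, plain pandas): the ratio/tolerance
# screen used above, applied to toy FBA/FBS totals. The numbers are made up.
def _example_ratio_tolerance(tolerance=0.01):
    import pandas as pd
    df = pd.DataFrame({'FBA_amount': [100.0, 100.0, 100.0],
                       'FBS_amount': [100.0, 98.0, 100.5]})
    df['Ratio'] = df['FBS_amount'] / df['FBA_amount']
    # flag rows where the FBS total drifts from the FBA total by more
    # than the tolerance in either direction
    flagged = df[(df['Ratio'] < 1 - tolerance) |
                 (df['Ratio'] > 1 + tolerance)]
    return flagged  # only the 98.0 row (ratio 0.98) is flagged
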
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments; 'method' is the name of the
        flowbysector method yaml and 'download_FBAs_if_missing' optionally
        indicates whether to download missing FBAs
    :return: parquet, FBS save to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields,
                             fba_fill_na_dict, drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'], dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for
            # different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info('Calculate FlowAmount difference caused by '
                              'replacing NAICS Codes with %s, saving '
                              'difference in Validation log',
                              method['target_sector_source'])
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data at a subnational geoscale,
                # check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k,
                                                   names, method)
                # if allocation method for an activity set requires a
                # specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v,
                                                    aset, aset_names,
                                                    download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine the "from" scale
                if fips_number_key[v['geoscale_to_use']] < \
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info('Searching for and allocating FlowAmounts for any '
                          'parent NAICS that were dropped in the subset to '
                          '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(
                    flows_mapped_wsec, fbs_agg_2, aset, k, method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for
                # an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(
                    v["clean_fbs_df_fxn_source"],
                    v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info('See the Validation log for detailed assessment of '
             'model results in %s', logoutputpath)

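
# Illustrative usage sketch (not executed on import): the kwargs keys mirror
# what main() reads above ('method' and 'download_FBAs_if_missing'); the
# default method name below is illustrative only and must correspond to an
# FBS method yaml available to flowsa.
def _example_run_fbs(method_name='Water_national_2015_m1'):
    """Sketch of a typical call into main()."""
    main(method=method_name, download_FBAs_if_missing=True)
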
def equally_allocate_suppressed_parent_to_child_naics(df_load, sector_column,
                                                      groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop from df,
    # add back in later as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [
            e for e in groupcols
            if e not in ['ActivityConsumedBy', 'ActivityProducedBy',
                         'Description']
        ]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(id_vars=["NAICS_6"],
                           var_name="NAICS_Length",
                           value_name="NAICS_Match").drop(
        columns=['NAICS_Length']).drop_duplicates()

    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup, cw_melt, how='left',
                         left_on=[sector_column], right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df; if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols], df, how='left', on=groupcols,
                   indicator=True).query('_merge=="left_only"').drop(
        '_merge', axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)

    # add length column and subset the data; subtract out existing data at
    # NAICS6 from total data at a length where there is no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # for each state, identify the sector length one level above the
    # shortest length that contains suppressed (zero) values
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment', 'Location'])['secLength'].agg(
        lambda x: x.min() - 1).reset_index(name='secLengthsup'))

    # merge the dfs and subtract out the last sector lengths with all data
    # for each state; drop states that don't have suppressed data
    df1 = df.merge(df_sup2)
    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)

    # determine sector to merge on
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [
        e for e in fba_default_grouping_fields
        if e not in ['ActivityConsumedBy', 'ActivityProducedBy']
    ]
    sum_cols.append('mergeSec')
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={
        sector_column: 'NewNAICS',
        'mergeSec': sector_column
    })
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] -
                              dfe['FlowAlloc']) / dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount', 'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])
    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are sector-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)
    return dff

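
# Illustrative sketch (hypothetical helper, plain pandas): the arithmetic the
# function above applies when estimating suppressed child NAICS. A parent
# total of 100 with one reported child (30) and two suppressed children
# leaves 70 to split equally, i.e. 35 per suppressed child. The NAICS codes
# and amounts are made up.
def _example_suppression_split():
    import pandas as pd
    parent_total = 100.0
    children = pd.DataFrame({'NAICS': ['11111', '11112', '11119'],
                             'FlowAmount': [30.0, 0.0, 0.0]})
    suppressed = children['FlowAmount'] == 0
    # unallocated remainder = parent total minus reported child values
    remainder = parent_total - children['FlowAmount'].sum()
    children.loc[suppressed, 'FlowAmount'] = remainder / suppressed.sum()
    return children  # suppressed rows each receive 35.0
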
def sector_disaggregation(df_load):
    """
    Function to disaggregate sectors when a parent NAICS has only one child
    at the next level; adds values for the missing naics5 and naics6
    :param df_load: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like, if aggregating
    # a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop from df,
    # add back in later as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')

    # loop from the minimum sector length up to 6 digits,
    # where the minimum length cannot be less than 2
    fields_list = []
    for i in range(2):
        if not (df[fbs_activity_fields[i]] == "").all():
            fields_list.append(fbs_activity_fields[i])

    length = df[fields_list].apply(lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value
        # in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(
                lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(
                lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=df_subset[
            fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=df_subset[
            fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns
        # are in naics list
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[
            (df_subset['SectorProduced_tmp'] == "") &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3],
                              sort=False)
        # drop all rows with duplicate temp values, as a less aggregated
        # naics exists
        # list of column headers that, if they exist in the df, should
        # be aggregated using the weighted avg fxn
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp',
                                   'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [
            e for e in possible_column_headers
            if e in df_subset.columns.values.tolist()
        ]

        df_subset = df_subset.drop_duplicates(
            subset=cols_to_drop, keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=[
            "SectorProducedBy", "SectorConsumedBy", "SectorProduced_tmp",
            "SectorConsumed_tmp"
        ])
        new_naics = new_naics.rename(columns={
            "SPB": "SectorProducedBy",
            "SCB": "SectorConsumedBy"
        })
        # append new naics to df
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True, ignore_index=True)

    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    # if activities are sector-like, set col values
    # as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    return df

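
# Illustrative sketch (hypothetical helper, plain pandas): the rule applied in
# sector_disaggregation above. When a parent NAICS has exactly one child in
# the crosswalk, the parent's flow is carried down to that child unchanged.
# The crosswalk values and flows below are made up for illustration.
def _example_single_child_disaggregation():
    import pandas as pd
    cw = pd.DataFrame({'NAICS_5': ['11111', '11111', '11121'],
                       'NAICS_6': ['111110', '111115', '111211']})
    # keep only parents with a single child (same keep=False trick as above)
    single = cw.drop_duplicates(subset=['NAICS_5'], keep=False)
    flows = pd.DataFrame({'SectorProducedBy': ['11111', '11121'],
                          'FlowAmount': [10.0, 5.0]})
    out = flows.merge(single, left_on='SectorProducedBy',
                      right_on='NAICS_5', how='inner')
    out['SectorProducedBy'] = out['NAICS_6']
    # only the 11121 row is carried down (to 111211); 11111 has two children
    return out[['SectorProducedBy', 'FlowAmount']]
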
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or
        a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop from df and group cols,
    # add back in later as copies of the sector columns
    if sector_like_activities:
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0],
                 fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # loop in reverse order from the longest naics length down to 3,
    # aggregating each length to the next shorter length;
    # appends missing naics levels to df
    for i in range(length, 2, -1):
        # df where either sector column is length i or both columns are
        df1 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i))) |
                 ((df['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))]
        # add new columns dropping last digit of sectors
        df1 = df1.assign(
            SPB=df1['SectorProducedBy'].apply(lambda x: x[0:i - 1]))
        df1 = df1.assign(
            SCB=df1['SectorConsumedBy'].apply(lambda x: x[0:i - 1]))
        # second dataframe where length is i - 1
        df2 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))) |
                 ((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) &
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))
                  )].rename(columns={
                      'SectorProducedBy': 'SPB',
                      'SectorConsumedBy': 'SCB'
                  })
        # merge the dfs
        merge_cols = [col for col in df2.columns
                      if hasattr(df2[col], 'str')]
        # also drop activity and description cols
        merge_cols = [
            c for c in merge_cols
            if c not in ['ActivityConsumedBy', 'ActivityProducedBy',
                         'Description']
        ]
        if len(df2) > 0:
            dfm = df1.merge(df2[merge_cols], how='outer', on=merge_cols,
                            indicator=True).query(
                '_merge=="left_only"').drop('_merge', axis=1)
        else:
            dfm = df1.copy(deep=True)

        if len(dfm) > 0:
            # replace the SCB and SPB columns then aggregate and add to df
            dfm['SectorProducedBy'] = dfm['SPB']
            dfm['SectorConsumedBy'] = dfm['SCB']
            dfm = dfm.drop(columns=(['SPB', 'SCB']))
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(dfm, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)
            df = df.drop_duplicates()

    # if activities are sector-like, set col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df).reset_index(drop=True)

    return df

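
# Illustrative sketch (hypothetical helper, plain pandas): the roll-up idea in
# sector_aggregation above. Truncate the last digit of the sector code and sum
# the flows, producing the next-higher aggregation level. The sample codes
# and amounts are made up.
def _example_sector_rollup():
    import pandas as pd
    df = pd.DataFrame({'SectorProducedBy': ['111111', '111119', '111211'],
                       'FlowAmount': [2.0, 3.0, 4.0]})
    # drop the last digit to move from NAICS6 to NAICS5
    df['SectorProducedBy'] = df['SectorProducedBy'].str[:-1]
    # sum flows within the shortened sector codes
    # -> '11111': 5.0, '11121': 4.0
    return df.groupby('SectorProducedBy', as_index=False)['FlowAmount'].sum()
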
def get_fba_allocation_subset(fba_allocation, source, activitynames,
                              **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subset; an example of a special case is when the
    # allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    if check_activities_sector_like(source) is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values
        # pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset = \
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(sector_list)
                     ) |
                    (fba_allocation[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset = \
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))
            ].reset_index(drop=True)
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            modified_activitynames = \
                pd.concat([part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(
                    modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(
                    modified_activitynames))].reset_index(drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(activitynames)
                 )].reset_index(drop=True)

    # if activity set names included in function call and activity set names
    # is not null, then subset data based on value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = \
            asn[asn['name'].isin(activitynames)].reset_index(drop=True)
        if asn_subset['allocation_subset'].isna().all():
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error('Define column and value to subset on in the activity '
                      'set csv for all rows')
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset

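
# Illustrative sketch (hypothetical helper, plain pandas): the basic subset
# performed above when no special cases apply, keeping allocation rows whose
# produced- or consumed-activity is in the requested activity list. Column
# and activity names are illustrative only.
def _example_subset_by_activity():
    import pandas as pd
    alloc = pd.DataFrame({
        'ActivityProducedBy': ['Corn', 'Wheat', 'Soy'],
        'ActivityConsumedBy': [None, None, None],
        'FlowAmount': [1.0, 2.0, 3.0]})
    activitynames = ['Corn', 'Soy']
    subset = alloc[
        alloc['ActivityProducedBy'].isin(activitynames) |
        alloc['ActivityConsumedBy'].isin(activitynames)
    ].reset_index(drop=True)
    return subset  # keeps the Corn and Soy rows
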