def proportional_allocation_by_location(df):
    """
    Create allocation ratios based on the most aggregated (2-digit)
    sectors within each location.

    Sectors must be present at the 2-digit level - run
    sector_aggregation() beforehand if necessary.
    :param df: df, includes sector columns
    :return: df, with 'FlowAmountRatio' column
    """
    # temporarily swap NoneType for empty strings so len() is safe
    df = replace_NoneType_with_empty_cells(df)

    # rows where either sector column is at the 2-digit level
    spb_is_2digit = df['SectorProducedBy'].apply(lambda x: len(x) == 2)
    scb_is_2digit = df['SectorConsumedBy'].apply(lambda x: len(x) == 2)
    denom = df.loc[spb_is_2digit | scb_is_2digit]

    # denominator = total 2-digit FlowAmount within each location
    denom = denom.assign(
        Denominator=denom.groupby('Location')['FlowAmount'].transform('sum'))
    denom_sub = denom[['Location', 'LocationSystem', 'Year',
                       'Denominator']].drop_duplicates()

    # attach the denominator to every row, then compute the ratio
    alloc = df.merge(denom_sub, how='left')
    alloc.loc[:, 'FlowAmountRatio'] = \
        alloc['FlowAmount'] / alloc['Denominator']
    alloc = alloc.drop(columns=['Denominator']).reset_index()

    # restore NoneType in place of empty strings
    alloc = replace_strings_with_NoneType(alloc)

    return alloc
def proportional_allocation_by_location_and_activity(df, sectorcolumn):
    """
    Create a proportional allocation within each aggregated sector
    within a location.
    :param df: df with sector columns
    :param sectorcolumn: str, sector column for which to create
        allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """
    # temporarily replace NoneTypes with empty cells so len() is safe
    df = replace_NoneType_with_empty_cells(df)

    # the most aggregated sectors present define the denominator
    min_len = min(df[sectorcolumn].apply(lambda x: len(str(x))).unique())
    denom = df.loc[df[sectorcolumn].apply(
        lambda x: len(x) == min_len)].reset_index(drop=True)

    # group by whichever of these identifier columns the df contains
    candidate_group_cols = ['FlowName', 'Location', 'Activity',
                            'ActivityConsumedBy', 'ActivityProducedBy']
    group_cols = [c for c in candidate_group_cols if c in denom.columns]
    denom.loc[:, 'Denominator'] = \
        denom.groupby(group_cols)['HelperFlow'].transform('sum')

    # columns used to merge the denominator back onto the full df,
    # restricted to those that actually exist in the df
    candidate_merge_cols = ('Location', 'LocationSystem', 'Year',
                            'Activity', 'ActivityConsumedBy',
                            'ActivityProducedBy')
    merge_cols = [c for c in candidate_merge_cols if c in denom.columns]
    subset_cols = merge_cols + ['Denominator']

    # one denominator row per location/activity combination
    denom_sub = denom[subset_cols].drop_duplicates().reset_index(drop=True)

    # merge the denominator onto the df and compute the allocation ratio
    alloc = df.merge(denom_sub, how='left',
                     left_on=merge_cols, right_on=merge_cols)
    alloc.loc[:, 'FlowAmountRatio'] = \
        alloc['HelperFlow'] / alloc['Denominator']
    alloc = alloc.drop(columns=['Denominator']).reset_index(drop=True)

    # restore NoneType for empty cells, then fill missing helper flows
    alloc = replace_strings_with_NoneType(alloc)
    alloc['HelperFlow'] = alloc['HelperFlow'].fillna(0)

    return alloc
def write_naics_2012_crosswalk(): """ Create a NAICS 2 - 6 digit crosswalk :return: """ # load the useeior mastercrosswalk subset to the naics timeseries cw_load = load_crosswalk('sector_timeseries') # load BEA codes that will act as NAICS house = load_crosswalk('household') govt = load_crosswalk('government') bea = pd.concat([house, govt], ignore_index=True).rename( columns={'Code': 'NAICS_2012_Code', 'NAICS_Level_to_Use_For': 'secLength'}) bea = bea[['NAICS_2012_Code', 'secLength']] # extract naics 2012 code column and drop duplicates and empty cells cw = cw_load[['NAICS_2012_Code']].drop_duplicates() cw = replace_NoneType_with_empty_cells(cw) cw = cw[cw['NAICS_2012_Code'] != ''] # also drop the existing household and government codes because not all # inclusive and does not conform to NAICS length standards cw = cw[~cw['NAICS_2012_Code'].str.startswith( tuple(['F0', 'S0']))].reset_index(drop=True) # add column of sector length cw['secLength'] = cw['NAICS_2012_Code'].apply( lambda x: f"NAICS_{str(len(x))}") # add bea codes subbing for NAICS cw2 = pd.concat([cw, bea], ignore_index=True) # create dictionary of dataframes d = dict(tuple(cw2.groupby('secLength'))) for l in range(2, 9): d[f'NAICS_{l}'] = d[f'NAICS_{l}'][['NAICS_2012_Code']].reset_index( drop=True).rename( columns={'NAICS_2012_Code': f'NAICS_{l}'}) naics_cw = d['NAICS_2'] for l in range(3, 7): naics_cw = (d[f'NAICS_{l}'].assign(temp=d[f'NAICS_{l}'][ f'NAICS_{l}'].str.extract( pat=f"({'|'.join(naics_cw[f'NAICS_{l-1}'])})")).merge( naics_cw, how='right', left_on='temp', right_on=f'NAICS_{l-1}', suffixes=['', '_y'])).drop(columns=['temp']) # reorder naics_cw = naics_cw.reindex(sorted(naics_cw.columns), axis=1) # save as csv naics_cw.to_csv(datapath + "NAICS_2012_Crosswalk.csv", index=False)
def proportional_allocation(df, attr):
    """
    Create allocation ratios based on the most aggregated (2-digit)
    sectors within each location.

    Sectors must exist at the 2-digit level - run sector_aggregation()
    beforehand if necessary.
    :param df: df, includes sector columns
    :param attr: dictionary, attributes for an activity set
    :return: df, with 'FlowAmountRatio' column
    """
    # temporarily swap NoneType for empty strings so len() is safe
    df = replace_NoneType_with_empty_cells(df)

    # ratios are always generated per location; the method yaml can name
    # additional columns by which to partition the denominator
    groupby_cols = ['Location']
    denom_subset_cols = ['Location', 'LocationSystem', 'Year',
                         'Denominator']
    if 'allocation_merge_columns' in attr:
        groupby_cols = groupby_cols + attr['allocation_merge_columns']
        denom_subset_cols = \
            denom_subset_cols + attr['allocation_merge_columns']

    # keep only rows where either sector column is at the 2-digit level
    two_digit_mask = \
        (df['SectorProducedBy'].apply(lambda x: len(x) == 2)
         | df['SectorConsumedBy'].apply(lambda x: len(x) == 2))
    denom = df.loc[two_digit_mask]

    # sum the 2-digit flows within each group to form the denominator
    denom = denom.assign(
        Denominator=denom.groupby(groupby_cols)
        ['FlowAmount'].transform('sum'))

    # one denominator row per group
    denom_sub = denom[denom_subset_cols].drop_duplicates()

    # merge the denominator back onto the df and compute the ratio
    alloc = df.merge(denom_sub, how='left')
    alloc.loc[:, 'FlowAmountRatio'] = \
        alloc['FlowAmount'] / alloc['Denominator']
    alloc = alloc.drop(columns=['Denominator']).reset_index()

    # restore NoneType in place of empty strings
    alloc = replace_strings_with_NoneType(alloc)

    return alloc
def aggregator(df, groupbycols):
    """
    Aggregate the 'FlowAmount' column of a flowbyactivity or
    flowbysector df, generating FlowAmount-weighted averages for the
    data-quality columns.
    :param df: df, Either flowbyactivity or flowbysector
    :param groupbycols: list, Either flowbyactivity or flowbysector
        columns
    :return: df, with aggregated columns
    """
    # start from a clean index and temporarily strip NoneType values
    df = df.reset_index(drop=True)
    df = replace_NoneType_with_empty_cells(df)

    # rows with no flow contribute nothing to sums or weighted averages
    df = df[df['FlowAmount'] != 0]

    # data-quality columns that, when present, are combined via a
    # FlowAmount-weighted average rather than a straight sum
    weighted_cols = [
        col for col in ('Spread', 'Min', 'Max', 'DataReliability',
                        'TemporalCorrelation', 'GeographicalCorrelation',
                        'TechnologicalCorrelation', 'DataCollection')
        if col in df.columns
    ]

    # sum the flows within each group
    grouped = df.groupby(groupbycols).agg({'FlowAmount': ['sum']})

    # attach a weighted average for each data-quality column present
    for col in weighted_cols:
        grouped[col] = get_weighted_average(df, col, 'FlowAmount',
                                            groupbycols)

    grouped = grouped.reset_index()
    # flatten the MultiIndex columns created by .agg()
    grouped.columns = grouped.columns.droplevel(level=1)

    # if datatypes are strings, ensure that Null values remain NoneType
    grouped = replace_strings_with_NoneType(grouped)

    return grouped
def disaggregate_pastureland(fba_w_sector, attr, method, year,
                             sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the
    3 digit NAICS '112'. This function uses state level CoA 'Land in
    Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to
        sectors
    :param attr: dictionary, attribute data from method yaml for
        activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated
        pastureland data
    """
    # tmp drop NoneTypes so string slicing is safe
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland (NAICS prefix '112')
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column holding the 2-digit state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics data to allocate against
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year,
            flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(
            lambda x: x[0:3]) == '112']
        # drop rows with '&' (combined-activity rows)
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios; activity columns are excluded
        # from the grouping
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop rows where the sector is the aggregate naics '11'
        df_f = df_f[df_f[sector_column] != '11']
        # reduce state locations to 2-digit fips (drop trailing '000')
        df_f = df_f.assign(
            Location=df_f['Location'].apply(lambda x: x[0:2]))
        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data on the
        # state fips so county rows pick up state-level ratios
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left', left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio to disaggregate
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop the merge helper columns and rename the suffixed ones back
        df = df.drop(columns=['Location_tmp', sector_column + '_x',
                              'Location_y', 'FlowAmountRatio'])
        df = df.rename(columns={sector_column + '_y': sector_column,
                                "Location_x": 'Location'})
        # drop rows where sector = 112 and then concat with original
        # fba_w_sector, replacing them with the disaggregated rows
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def allocate_dropped_sector_data(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at target
    sector level. Equally allocate parent NAICS to child NAICS where
    child NAICS missing.
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """
    # exclude nonsectors: swap NoneType for empty strings so len() works
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of rows where sector length == i
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True,
                         sort=False)

        # create df of rows where sector length == i + 1
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) |
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) &
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths or gov't transport
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs: a left row with no i+1 match is "dropped"
        df_m = pd.merge(
            df_x,
            df_y[['Class', 'Context', 'FlowType', 'Flowable', 'Location',
                  'LocationSystem', 'Unit', 'Year', 'spb_tmp',
                  'scb_tmp']],
            how='left',
            left_on=['Class', 'Context', 'FlowType', 'Flowable',
                     'Location', 'LocationSystem', 'Unit', 'Year',
                     'SectorProducedBy', 'SectorConsumedBy'],
            right_on=['Class', 'Context', 'FlowType', 'Flowable',
                      'Location', 'LocationSystem', 'Unit', 'Year',
                      'spb_tmp', 'scb_tmp'])

        # extract the rows that are not disaggregated to more
        # specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) &
                  (df_m['spb_tmp'].isnull())].reset_index(drop=True)
        # clean df
        rl = replace_strings_with_NoneType(rl)
        rl_list = rl[['SectorProducedBy', 'SectorConsumedBy'
                      ]].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors:
        # import cw and subset to current and target sector lengths
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with count of children per parent, used to
        # divide the parent flow equally among them
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform(
            'count')

        # merge df & conditionally replace sector produced/consumed cols
        rl_m = pd.merge(rl, cw, how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m, cw, how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column (the two merges above each
        # produced one, suffixed _x/_y)
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to the running total of reallocated rows
        if len(rl) != 0:
            vLogDetailed.warning(
                'Data found at %s digit NAICS not represented in current '
                'data subset: %s', str(i), ' '.join(map(str, rl_list)))
            # DataFrame.append was removed in pandas 2.0; use pd.concat
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to each %s associated with '
            'the sectors previously dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True,
                               sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
def estimate_suppressed_data(df, sector_column, naics_level, sourcename): """ Estimate data suppression, by equally allocating parent NAICS values to child NAICS :param df: df with sector columns :param sector_column: str, column to estimate suppressed data for :param naics_level: numeric, indicate at what NAICS length to base estimated suppresed data off (2 - 5) :param sourcename: str, sourcename :return: df, with estimated suppressed data """ # exclude nonsectors df = replace_NoneType_with_empty_cells(df) # find the longest length sector max_length = max(df[sector_column].apply(lambda x: len(str(x))).unique()) # loop through starting at naics_level, use most detailed level possible to save time for i in range(naics_level, max_length): # create df of i length df_x = df.loc[df[sector_column].apply(lambda x: len(x) == i)] # create df of i + 1 length df_y = df.loc[df[sector_column].apply(lambda x: len(x) == i + 1)] # create temp sector columns in df y, that are i digits in length df_y = df_y.assign(s_tmp=df_y[sector_column].apply(lambda x: x[0:i])) # create list of location and temp activity combos that contain a 0 missing_sectors_df = df_y[df_y['FlowAmount'] == 0] missing_sectors_list = missing_sectors_df[['Location', 's_tmp']].drop_duplicates().values.tolist() # subset the y df if len(missing_sectors_list) != 0: # new df of sectors that start with missing sectors. 
# drop last digit of the sector and sum flows set conditions suppressed_list = [] for q, r, in missing_sectors_list: c1 = df_y['Location'] == q c2 = df_y['s_tmp'] == r # subset data suppressed_list.append(df_y.loc[c1 & c2]) suppressed_sectors = pd.concat(suppressed_list, sort=False, ignore_index=True) # add column of existing allocated data for length of i suppressed_sectors['alloc_flow'] =\ suppressed_sectors.groupby(['Location', 's_tmp'])['FlowAmount'].transform('sum') # subset further so only keep rows of 0 value suppressed_sectors_sub = suppressed_sectors[suppressed_sectors['FlowAmount'] == 0] # add count suppressed_sectors_sub = \ suppressed_sectors_sub.assign(sector_count= suppressed_sectors_sub.groupby( ['Location', 's_tmp'] )['s_tmp'].transform('count')) # merge suppressed sector subset with df x df_m = pd.merge(df_x, suppressed_sectors_sub[['Class', 'Compartment', 'FlowType', 'FlowName', 'Location', 'LocationSystem', 'Unit', 'Year', sector_column, 's_tmp', 'alloc_flow', 'sector_count']], left_on=['Class', 'Compartment', 'FlowType', 'FlowName', 'Location', 'LocationSystem', 'Unit', 'Year', sector_column], right_on=['Class', 'Compartment', 'FlowType', 'FlowName', 'Location', 'LocationSystem', 'Unit', 'Year', 's_tmp'], how='right') # drop any rows where flowamount is none df_m = df_m[~df_m['FlowAmount'].isna()] # calculate estimated flows by subtracting the flow # amount already allocated from total flow of # sector one level up and divide by number of sectors with suppressed data df_m.loc[:, 'FlowAmount'] = \ (df_m['FlowAmount'] - df_m['alloc_flow']) / df_m['sector_count'] # only keep the suppressed sector subset activity columns df_m = df_m.drop(columns=[sector_column + '_x', 's_tmp', 'alloc_flow', 'sector_count']) df_m = df_m.rename(columns={sector_column + '_y': sector_column}) # reset activity columns if load_source_catalog()[sourcename]['sector-like_activities']: df_m = df_m.assign(ActivityProducedBy=df_m['SectorProducedBy']) df_m = 
df_m.assign(ActivityConsumedBy=df_m['SectorConsumedBy']) # drop the existing rows with suppressed data and append the new estimates from fba df modified_df =\ pd.merge(df, df_m[['FlowName', 'Location', sector_column]], indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1) df = pd.concat([modified_df, df_m], ignore_index=True) df_w_estimated_data = replace_strings_with_NoneType(df) return df_w_estimated_data
def sector_disaggregation(df): """ function to disaggregate sectors if there is only one naics at a lower level works for lower than naics 4 :param df: A FBS df, must have sector columns :return: A FBS df with values for the missing naics5 and naics6 """ # ensure None values are not strings df = replace_NoneType_with_empty_cells(df) # load naics 2 to naics 6 crosswalk cw_load = load_sector_length_crosswalk() # for loop min length to 6 digits, where min length cannot be less than 2 length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply( lambda x: x.str.len()).min().min() if length < 2: length = 2 # appends missing naics levels to df for i in range(length, 6): sector_merge = 'NAICS_' + str(i) sector_add = 'NAICS_' + str(i+1) # subset the df by naics length cw = cw_load[[sector_merge, sector_add]] # only keep the rows where there is only one value in sector_add for a value in sector_merge cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True) sector_list = cw[sector_merge].values.tolist() # subset df to sectors with length = i and length = i + 1 df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) | df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)] # create new columns that are length i df_subset = df_subset.assign(SectorProduced_tmp= df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i])) df_subset = df_subset.assign(SectorConsumed_tmp= df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i])) # subset the df to the rows where the tmp sector columns are in naics list df_subset_1 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) & (df_subset['SectorConsumed_tmp'] == "")] df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") & (df_subset['SectorConsumed_tmp'].isin(sector_list))] df_subset_3 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) & (df_subset['SectorConsumed_tmp'].isin(sector_list))] # concat existing dfs df_subset = 
pd.concat([df_subset_1, df_subset_2, df_subset_3], sort=False) # drop all rows with duplicate temp values, as a less aggregated naics exists # list of column headers, that if exist in df, should be # aggregated using the weighted avg fxn possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context', 'Compartment', 'Location', 'Year', 'SectorProduced_tmp', 'SectorConsumed_tmp') # list of column headers that do exist in the df being subset cols_to_drop = [e for e in possible_column_headers if e in df_subset.columns.values.tolist()] df_subset = df_subset.drop_duplicates(subset=cols_to_drop, keep=False).reset_index(drop=True) # merge the naics cw new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]], how='left', left_on=['SectorProduced_tmp'], right_on=[sector_merge]) new_naics = new_naics.rename(columns={sector_add: "SPB"}) new_naics = new_naics.drop(columns=[sector_merge]) new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]], how='left', left_on=['SectorConsumed_tmp'], right_on=[sector_merge]) new_naics = new_naics.rename(columns={sector_add: "SCB"}) new_naics = new_naics.drop(columns=[sector_merge]) # drop columns and rename new sector columns new_naics = new_naics.drop(columns=["SectorProducedBy", "SectorConsumedBy", "SectorProduced_tmp", "SectorConsumed_tmp"]) new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy", "SCB": "SectorConsumedBy"}) # append new naics to df new_naics['SectorConsumedBy'] = new_naics['SectorConsumedBy'].replace({np.nan: ""}) new_naics['SectorProducedBy'] = new_naics['SectorProducedBy'].replace({np.nan: ""}) new_naics = replace_NoneType_with_empty_cells(new_naics) df = pd.concat([df, new_naics], sort=True) # replace blank strings with None df = replace_strings_with_NoneType(df) return df
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not, sums
    the less aggregated sectors to create it.
    :param df_load: Either a flowbyactivity df with sectors or a
        flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # determine if activities are sector-like, if aggregating a df
    # with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # load source catalog
        cat = load_source_catalog()
        # for s in pd.unique(flowbyactivity_df['SourceName']):
        s = pd.unique(df_load['SourceName'])[0]
        # load catalog info for source
        src_info = cat[s]
        sector_like_activities = src_info['sector-like_activities']

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # if activities are source like, drop from df and group calls,
    # add back in as copies of sector columns at the end
    if sector_like_activities:
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy',
                                'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2;
    # appends missing naics levels to df
    for i in range(length - 1, 1, -1):
        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(
                lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(
                lambda x: i + 1 >= len(x) >= i)]
        # create a list of i digit sectors in df subset
        sector_subset = df_subset[
            ['Location', fbs_activity_fields[0],
             fbs_activity_fields[1]]].drop_duplicates().reset_index(
            drop=True)
        df_sectors = sector_subset.copy()
        df_sectors.loc[:, 'SectorProducedBy'] = \
            df_sectors['SectorProducedBy'].apply(lambda x: x[0:i])
        df_sectors.loc[:, 'SectorConsumedBy'] = \
            df_sectors['SectorConsumedBy'].apply(lambda x: x[0:i])
        sector_list = df_sectors.drop_duplicates().values.tolist()
        # create a list of sectors that are exactly i digits long
        # where either sector column is i digits in length
        df_existing_1 = sector_subset.loc[
            (sector_subset['SectorProducedBy'].apply(
                lambda x: len(x) == i)) |
            (sector_subset['SectorConsumedBy'].apply(
                lambda x: len(x) == i))]
        # where both sector columns are i digits in length
        df_existing_2 = sector_subset.loc[
            (sector_subset['SectorProducedBy'].apply(
                lambda x: len(x) == i)) &
            (sector_subset['SectorConsumedBy'].apply(
                lambda x: len(x) == i))]
        # concat existing dfs
        df_existing = pd.concat([df_existing_1, df_existing_2],
                                sort=False)
        existing_sectors = \
            df_existing.drop_duplicates().dropna().values.tolist()
        # list of sectors of length i that are not in sector list
        missing_sectors = [e for e in sector_list
                           if e not in existing_sectors]
        if len(missing_sectors) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows
            # set conditions
            agg_sectors_list = []
            for q, r, s in missing_sectors:
                c1 = df_subset['Location'] == q
                c2 = df_subset[fbs_activity_fields[0]].apply(
                    lambda x: x[0:i] == r)
                c3 = df_subset[fbs_activity_fields[1]].apply(
                    lambda x: x[0:i] == s)
                # subset data
                agg_sectors_list.append(df_subset.loc[c1 & c2 & c3])
            agg_sectors = pd.concat(agg_sectors_list, sort=False)
            # keep only rows more detailed than i digits, then
            # truncate them to i digits
            agg_sectors = agg_sectors.loc[
                (agg_sectors[fbs_activity_fields[0]].apply(
                    lambda x: len(x) > i)) |
                (agg_sectors[fbs_activity_fields[1]].apply(
                    lambda x: len(x) > i))]
            agg_sectors.loc[:, fbs_activity_fields[0]] = \
                agg_sectors[fbs_activity_fields[0]].apply(
                    lambda x: x[0:i])
            agg_sectors.loc[:, fbs_activity_fields[1]] = \
                agg_sectors[fbs_activity_fields[1]].apply(
                    lambda x: x[0:i])
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(agg_sectors, group_cols)
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            # append to df; DataFrame.append was removed in pandas 2.0,
            # use pd.concat instead
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)

    # manually modify non-NAICS codes that might exist in sector
    df.loc[:, 'SectorConsumedBy'] = np.where(
        df['SectorConsumedBy'].isin(['F0', 'F01']), 'F010',
        df['SectorConsumedBy'])  # domestic/household
    df.loc[:, 'SectorProducedBy'] = np.where(
        df['SectorProducedBy'].isin(['F0', 'F01']), 'F010',
        df['SectorProducedBy'])  # domestic/household
    # drop any duplicates created by modifying sector codes
    df = df.drop_duplicates()
    # if activities are source-like, set col values as copies of the
    # sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df)

    return df
def equally_allocate_suppressed_parent_to_child_naics(
        df_load, sector_column, groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS.
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    # disaggregate unambiguous parents, then drop rows with no sector
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df,
    # add back in as copies of sector columns at the end
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [
            e for e in groupcols if e not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

    # load naics 2 to naics 6 crosswalk and melt to map any sector
    # length to its NAICS_6 children
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(
        id_vars=["NAICS_6"], var_name="NAICS_Length",
        value_name="NAICS_Match").drop(
        columns=['NAICS_Length']).drop_duplicates()

    # rows with suppressed (0) flows
    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup, cw_melt, how='left',
                         left_on=[sector_column],
                         right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original (anti-join via indicator)
    dfm = pd.merge(new_naics[groupcols], df, how='left', on=groupcols,
                   indicator=True).query(
        '_merge=="left_only"').drop('_merge', axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)

    # add length column and subset the data;
    # subtract out existing data at NAICS6 from total data
    # at a length where no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))
    # add column for each state of sector length where
    # there are no missing values
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment', 'Location'])
        ['secLength'].agg(lambda x: x.min() - 1).reset_index(
        name='secLengthsup'))

    # merge the dfs and sub out the last sector lengths with
    # all data for each state; drop states that don't have
    # suppressed data
    df1 = df.merge(df_sup2)
    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on: the prefix at the unsuppressed
    # length for each row
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [
        e for e in fba_default_grouping_fields
        if e not in ['ActivityConsumedBy', 'ActivityProducedBy']
    ]
    sum_cols.append('mergeSec')
    # total already-allocated flow per merge sector
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={
        sector_column: 'NewNAICS',
        'mergeSec': sector_column
    })
    # keep flows with 0 flow (the rows needing an estimate)
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] - dfe['FlowAlloc']) /
                     dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount',
                            'newFlow'])

    # new df with estimated naics6; drop the 0-flow placeholders and
    # re-aggregate
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])

    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are source-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)

    return dff
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after
    subsetting by activity and geography with the final flowbysector
    output total. Not a direct comparison of the loaded FBA because
    FBAs are modified before being subset by activity
    for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """
    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale: use the coarser of the source geoscale and the
    # allocation-from scale (smaller fips_number_key value = coarser scale)
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[['Class', 'FlowAmount', 'Unit', 'Context',
                   'ActivityProducedBy', 'ActivityConsumedBy', 'Location',
                   'LocationSystem']]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(columns={'ActivityProducedBy': 'SectorProducedBy',
                                  'ActivityConsumedBy': 'SectorConsumedBy'})
        group_cols_agg = ['Class', 'Context', 'Unit', 'Location',
                          'LocationSystem', 'SectorProducedBy',
                          'SectorConsumedBy']
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2) |
                  fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]

    # subset/agg dfs
    col_subset = ['Class', 'FlowAmount', 'Unit', 'Context', 'Location',
                  'LocationSystem']
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount', 'Unit': 'FBA_unit'},
                   inplace=True)
    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount', 'Unit': 'FBS_unit'},
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference'] / df_merge['FBA_amount']) * 100
        # reorder
        df_merge = df_merge[['Class', 'Context', 'Location',
                             'LocationSystem', 'FBA_amount', 'FBA_unit',
                             'FBS_amount', 'FBS_unit',
                             'FlowAmount_difference', 'Percent_difference']]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info('Comparing FBA %s %s subset to FBS results. '
                  'Details in Validation Log', activity_set,
                  source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info('FlowBySector FlowAmount for %s %s %s '
                          'does not exist in the FBS', source_name,
                          activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_per > 0 means FBA > FBS, so the FBS total is smaller
            if diff_per > 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                          'less than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info('FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                          'more than the FlowByActivity FlowAmount',
                          source_name, activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info('FlowBySector FlowAmount for '
                                  '%s %s %s at %s is equal to the '
                                  'FlowByActivity FlowAmount',
                                  source_name, activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info('Save the comparison of FlowByActivity load to FlowBySector '
                 'total FlowAmounts for %s in validation log file',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info('Comparison of FBA load to FBS total '
                              'FlowAmounts for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        # was a bare "except:" which also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception only and record the traceback
        # so validation failures are debuggable
        vLog.exception('Error occurred when comparing total FlowAmounts '
                       'for FlowByActivity and FlowBySector')
def determine_flows_requiring_disaggregation(df_load, attr, method,
                                             sector_column):
    """
    The MECS Land data provides FlowAmounts for NAICS3-6. We use BLS QCEW
    employment data to determine land use for different industries. To
    accurately estimate land use per industry, existing FlowAmounts for a
    particular NAICS level (NAICS6) for example, should be subtracted from
    the possible FlowAmounts for other NAICS6 that share the first 5 digits.
    For Example, there is data for '311', '3112', and '311221' in the 2014
    dataset. FlowAmounts for allocation by employment for NAICS6 are based
    on the provided '3112' FlowAmounts. However, since there is data at one
    NAICS6 (311221), the FlowAmount for that NAICS6 should be subtracted
    from other NAICS6 to accurately depict the remaining 'FlowAmount' that
    requires a secondary source (Employment data) for allocation.
    :param df_load: df, EIA MECS Land FBA
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param sector_column: str, sector column to flag ('SectorProducedBy',
        'SectorConsumedBy')
    :return: A dataframe with a column 'disaggregate_flag', if '1',
        row requires secondary source to calculate FlowAmount, if '0'
        FlowAmount does not require modifications
    """
    from flowsa.sectormapping import add_sectors_to_flowbyactivity

    df_load = replace_NoneType_with_empty_cells(df_load)
    # drop rows where there is no value in sector column, which might occur
    # if sector-like activities have a "-" in them
    df_load = df_load[df_load[sector_column] != '']

    # determine activity column
    if sector_column == 'SectorConsumedBy':
        activity_column = 'ActivityConsumedBy'
    else:
        activity_column = 'ActivityProducedBy'

    # original df - subset
    # subset cols of original df
    dfo = df_load[['FlowAmount', 'Location', sector_column]]
    # min and max length
    min_length = min(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    max_length = max(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    # subset by sector length, creating a df per length. A plain dict keyed
    # by length replaces the previous vars()['dfo_naics' + str(s)] pattern:
    # writing to vars()/locals() inside a function is undefined behavior and
    # no longer works reliably (see PEP 667)
    dfo_by_length = {}
    for s in range(min_length, max_length + 1):
        df_s = dfo[dfo[sector_column].apply(
            lambda x: len(x) == s)].reset_index(drop=True)
        # SectorMatch = sector truncated by one digit, used to match a
        # child NAICS to its parent one level up
        df_s = df_s.assign(SectorMatch=df_s[sector_column].apply(
            lambda x: x[:len(x) - 1]))
        dfo_by_length[s] = df_s

    # loop through the dfs, merging by sector match. If there is a match,
    # subtract the value, if there is not a match, drop last digit in
    # sectormatch, add row to the next df, and repeat
    df_merged = pd.DataFrame()
    df_not_merged = pd.DataFrame()
    for s in range(max_length, min_length, -1):
        # concat the length-s df with rows carried over from longer lengths
        df2 = pd.concat([dfo_by_length[s], df_not_merged])
        df2 = df2.rename(columns={'FlowAmount': 'SubtractFlow',
                                  sector_column: 'Sector'})
        df_m = pd.merge(
            dfo_by_length[s - 1][['FlowAmount', 'Location', sector_column]],
            df2, left_on=['Location', sector_column],
            right_on=['Location', 'SectorMatch'],
            indicator=True, how='outer')
        # subset by merge and append to appropriate df
        df_both = df_m[df_m['_merge'] == 'both']
        if len(df_both) != 0:
            # drop columns
            df_both1 = df_both.drop(
                columns=['Sector', 'SectorMatch', '_merge'])
            # aggregate before subtracting ('sum' string alias rather than
            # the builtin, so pandas uses its optimized path)
            df_both2 = df_both1.groupby(
                ['FlowAmount', 'Location', sector_column],
                as_index=False).agg({"SubtractFlow": 'sum'})
            df_both3 = df_both2.assign(
                FlowAmount=df_both2['FlowAmount'] - df_both2['SubtractFlow'])
            df_both3 = df_both3.drop(columns=['SubtractFlow'])
            # pd.concat replaces DataFrame.append (removed in pandas 2.0)
            df_merged = pd.concat([df_merged, df_both3], ignore_index=True)
        df_right = df_m[df_m['_merge'] == 'right_only']
        if len(df_right) != 0:
            df_right = df_right.drop(
                columns=['FlowAmount', sector_column, '_merge'])
            df_right = df_right.rename(columns={'SubtractFlow': 'FlowAmount',
                                                'Sector': sector_column})
            # remove another digit from Sectormatch so the row can match a
            # parent at the next (shorter) iteration
            df_right = df_right.assign(
                SectorMatch=df_right[sector_column].apply(
                    lambda x: x[:(s - 2)]))
            # reorder
            df_right = df_right[['FlowAmount', 'Location', sector_column,
                                 'SectorMatch']]
            df_not_merged = pd.concat([df_not_merged, df_right],
                                      ignore_index=True)

    # rename the flowamount column
    df_merged = df_merged.rename(columns={'FlowAmount': 'FlowAmountNew',
                                          sector_column: activity_column})

    # In the original EIA MECS df, some of the NAICS 6-digit codes sum
    # to a value greater than published NAICS3, due to rounding. In these
    # cases, the new FlowAmount is a negative number. Reset neg numbers to 0
    df_merged.loc[df_merged['FlowAmountNew'] < 0, 'FlowAmountNew'] = 0
    # in the original df, drop sector columns re-add sectors, this time with
    # sectors = 'aggregated'
    dfn = df_load.drop(columns=['SectorProducedBy', 'ProducedBySectorType',
                                'SectorConsumedBy', 'ConsumedBySectorType',
                                'SectorSourceName'])
    dfn = add_sectors_to_flowbyactivity(
        dfn, sectorsourcename=method['target_sector_source'],
        overwrite_sectorlevel='aggregated')
    # add column noting that these columns require an allocation ratio
    dfn = dfn.assign(disaggregate_flag=1)
    # create lists of sectors to drop
    list_original = df_load[activity_column].drop_duplicates().tolist()
    # drop values in original df
    dfn2 = dfn[~dfn[sector_column].isin(list_original)].sort_values(
        [activity_column, sector_column]).reset_index(drop=True)
    # drop the sectors that are duplicated by different naics being
    # mapped to naics6
    if len(dfn2[dfn2.duplicated(subset=['Location', sector_column],
                                keep=False)]) > 0:
        dfn2.drop_duplicates(subset=['Location', sector_column],
                             keep='last', inplace=True)
    # want to allocate at NAICS6, so drop all other sectors
    dfn2 = \
        dfn2[dfn2[sector_column].apply(lambda x: len(x) == 6)].reset_index(
            drop=True).sort_values([sector_column])

    # merge revised flowamounts back with modified original df
    df_to_allocate = dfn2.merge(df_merged, how='left')
    # replace FlowAmount with newly calculated FlowAmount,
    # which represents Flows that are currently unaccounted for at NAICS6
    df_to_allocate['FlowAmount'] = np.where(
        df_to_allocate['FlowAmountNew'].notnull(),
        df_to_allocate['FlowAmountNew'], df_to_allocate['FlowAmount'])
    # drop rows where flow amount = 0 - flows are captured through
    # other NAICS6
    df_to_allocate2 = df_to_allocate[
        df_to_allocate['FlowAmount'] != 0].drop(
        columns='FlowAmountNew').reset_index(drop=True)

    # merge the original df with modified
    # add column to original df for disaggregate_flag
    df_load = df_load.assign(disaggregate_flag=0)

    # concat the two dfs and sort
    df_c = pd.concat([df_load, df_to_allocate2],
                     ignore_index=True).sort_values(
        [sector_column]).reset_index(drop=True)

    df_c = replace_strings_with_NoneType(df_c).sort_values([sector_column])

    return df_c
def proportional_allocation_by_location_and_activity(df_load, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated
    sector within a location
    :param df_load: df with sector columns
    :param sectorcolumn: str, sector column for which to create
        allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """
    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # want to create denominator based on shortest length naics for each
    # activity/location
    grouping_cols = [e for e in ['FlowName', 'Location', 'Activity',
                                 'ActivityConsumedBy', 'ActivityProducedBy',
                                 'Class', 'SourceName', 'Unit', 'FlowType',
                                 'Compartment', 'Year']
                     if e in df.columns.values.tolist()]
    activity_cols = [e for e in ['Activity', 'ActivityConsumedBy',
                                 'ActivityProducedBy']
                     if e in df.columns.values.tolist()]
    # trim whitespace
    df[sectorcolumn] = df[sectorcolumn].str.strip()
    # to create the denominator dataframe first add a column that captures
    # the sector length
    denom_df = df.assign(sLen=df[sectorcolumn].str.len())
    # keep only the rows at the shortest sector length within each activity
    # grouping; pass the string alias 'min' (not the builtin min) so pandas
    # uses its optimized cython implementation and avoids the deprecation
    # path for builtin callables
    denom_df = denom_df[denom_df['sLen'] == denom_df.groupby(activity_cols)
                        ['sLen'].transform('min')].drop(columns='sLen')
    denom_df.loc[:, 'Denominator'] = \
        denom_df.groupby(grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers, that if exist in df, should be aggregated
    # using the weighted avg fxn
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [e for e in possible_column_headers
                      if e in denom_df.columns.values.tolist()]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = \
        denom_df[column_headers].drop_duplicates().reset_index(drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index(
        drop=True)

    # where parent NAICS are not found in the allocation dataset, make sure
    # those child NAICS are not dropped
    allocation_df['FlowAmountRatio'] = \
        allocation_df['FlowAmountRatio'].fillna(1)
    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland
    at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make
        df modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """
    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data (sectors beginning '112' are animal production)
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(
        lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location: first two FIPS digits = state
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(
        datasource="USDA_CoA_Cropland_NAICS", year=year, flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&' (combined NAICS codes cannot be
    # mapped to a single sector)
    naics = naics[~naics['ActivityConsumedBy'].str.contains(
        '&')].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in
    #               ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') &
    #                   (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # for loop through naics lengths to determine
    # naics 4 and 5 digits to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows: a location/parent-sector that already
        # has both an aggregate and a disaggregated row needs no ratio split
        df = crop_subset.drop_duplicates(
            subset=['Location', 'Sector_tmp'],
            keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(
                lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations; note the state-level naics
        # ratios are matched to county rows via the Location_tmp state code
        df_subset = pd.merge(
            df_subset, naics_subset[[sector_column, 'FlowAmountRatio',
                                     'Sector_tmp', 'Location_tmp']],
            how='left', left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = \
            df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[
            ~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns ('_x'/'_y' suffixes come from the merge above)
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
        save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[['Class', 'MetaSources', 'Flowable', 'Unit',
                        'FlowType', 'ActivityProducedBy',
                        'ActivityConsumedBy', 'Context', 'Location',
                        'LocationSystem', 'Year',
                        'FlowAmount']].drop_duplicates().reset_index(
            drop=True)
        # set a single national location so totals compare across geoscales
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                      'Unit', 'FlowType', 'Context', 'Location',
                      'LocationSystem', 'Year']
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df
        fbs = fbs_load[['Class', 'SectorSourceName', 'Flowable', 'Unit',
                        'FlowType', 'SectorProducedBy', 'SectorConsumedBy',
                        'ActivityProducedBy', 'ActivityConsumedBy',
                        'Context', 'Location', 'LocationSystem', 'Year',
                        'FlowAmount']].drop_duplicates().reset_index(
            drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        # SectorLength = the longer of the produced/consumed sector codes
        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = ['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                      'Unit', 'FlowType', 'Context', 'Location',
                      'LocationSystem', 'Year', 'SectorLength']
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge compare 1 and compare 2
        df_merge = fba_agg.merge(fbs_agg,
                                 left_on=['ActivityProducedBy',
                                          'ActivityConsumedBy', 'Flowable',
                                          'Unit', 'FlowType', 'Context',
                                          'Location', 'LocationSystem',
                                          'Year'],
                                 right_on=['ActivityProducedBy',
                                           'ActivityConsumedBy', 'Flowable',
                                           'Unit', 'FlowType', 'Context',
                                           'Location', 'LocationSystem',
                                           'Year'],
                                 how='left')
        # a Ratio of 1 means no data loss between FBA and FBS
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[['ActivityProducedBy', 'ActivityConsumedBy',
                             'Flowable', 'Unit', 'FlowType', 'Context',
                             'Location', 'LocationSystem', 'Year',
                             'SectorLength', 'FBA_amount', 'FBS_amount',
                             'Ratio']]

        # keep only rows of specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info('There are %s combinations of flowable/context/sector '
                      'length where the flowbyactivity to flowbysector ratio '
                      'is less than or greater than 1 by %s',
                      len(comparison2), str(tolerance))

        # include df subset in the validation log
        # only print rows where flowamount ratio is less than 1
        # (round flowamountratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info('Save the comparison of FlowByActivity load '
                 'to FlowBySector ratios for %s in validation log',
                 activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info('Comparison of FlowByActivity load to '
                              'FlowBySector ratios for %s: '
                              '\n {}'.format(df_v.to_string()), activity_set)
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data
    """
    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts (number of target-level children per parent)
        cw['sector_count'] = \
            cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset, cw, how='left',
                        left_on=[activity_field], right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            # single lazy %-style log call (previously a mix of str.format
            # and logging %-substitution on a concatenated string)
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            # pd.concat replaces DataFrame.append (removed in pandas 2.0)
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    # NOTE(review): reset_index without drop=True adds the old index as an
    # 'index' column; preserved for backward compatibility — confirm whether
    # drop=True was intended
    df_allocated.reset_index(inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the mastercroswalk and subset by sectorsourcename,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug('Checking if sectors represent a different '
                   'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where((df[c] == df['NAICS']) &
                             (df[c].isin(non_naics)),
                             df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            # (a replaced code may map to several current-year codes, so the
            # flow is scaled by the crosswalk's allocation ratio)
            df.loc[df[c] == df[sectorsourcename],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data (the merge above can create duplicate rows)
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [e for e in df.columns.values.tolist()
                        if e not in possible_column_headers]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull()) &
                     (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or
        a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df and group calls,
    # add back in as copies of sector columns columns to keep
    if sector_like_activities:
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]
                 ]].apply(lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length, 2, -1):
        # df where either sector column is length i; this also covers rows
        # where both columns are length i (the original additionally OR-ed
        # an "A & B" clause, which is logically redundant with "A | B")
        df1 = df[(df['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                 (df['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # add new columns dropping last digit of sectors
        df1 = df1.assign(SPB=df1['SectorProducedBy'].apply(
            lambda x: x[0:i - 1]))
        df1 = df1.assign(SCB=df1['SectorConsumedBy'].apply(
            lambda x: x[0:i - 1]))
        # second dataframe where length is i - 1
        df2 = df[(df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) |
                 (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))
                 ].rename(columns={'SectorProducedBy': 'SPB',
                                   'SectorConsumedBy': 'SCB'})
        # merge the dfs on the string-dtype columns
        merge_cols = [col for col in df2.columns
                      if hasattr(df2[col], 'str')]
        # also drop activity and description cols
        merge_cols = [c for c in merge_cols
                      if c not in ['ActivityConsumedBy',
                                   'ActivityProducedBy', 'Description']]

        if len(df2) > 0:
            # anti-join: keep only length-i rows whose truncated sectors
            # are not already present at length i - 1
            dfm = df1.merge(df2[merge_cols], how='outer', on=merge_cols,
                            indicator=True).query(
                '_merge=="left_only"').drop('_merge', axis=1)
        else:
            dfm = df1.copy(deep=True)

        if len(dfm) > 0:
            # replace the SCB and SPB columns then aggregate and add to df
            dfm['SectorProducedBy'] = dfm['SPB']
            dfm['SectorConsumedBy'] = dfm['SCB']
            dfm = dfm.drop(columns=(['SPB', 'SCB']))
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(dfm, group_cols)
            # append to df; pd.concat replaces DataFrame.append, which was
            # removed in pandas 2.0
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)
    df = df.drop_duplicates()

    # if activities are source-like, set col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    # replace null values
    df = replace_strings_with_NoneType(df).reset_index(drop=True)
    return df