import pandas as pd
import numpy as np

# NOTE: the flowsa helpers and constants referenced below (log, clean_df,
# load_sector_length_crosswalk_w_nonnaics, fbs_fill_na_dict,
# fbs_activity_fields, flow_by_sector_fields, sector_level_key) are assumed
# to be imported at module level from the flowsa package.


def sector_disaggregation_generalized(fbs, group_cols):
    """
    Disaggregate sectors to the next NAICS level when a parent NAICS code
    has exactly one child. Works for NAICS 4 and lower.
    :param fbs: a FBS df with a single 'Sector' column
    :param group_cols: list of columns to group by
    :return: a FBS df with missing naics5 and naics6 rows appended
    """
    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()

    # loop from the shortest sector length in the df up to 6 digits,
    # appending the missing naics levels to the df
    length = min(fbs['Sector'].apply(lambda x: len(x)).unique())
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the crosswalk by naics length, keeping only the rows where
        # there is a single sector_add value for a sector_merge value
        cw = cw_load[[sector_merge, sector_add]]
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length i or i + 1
        df_subset = fbs[fbs['Sector'].apply(lambda x: i + 1 >= len(x) >= i)]
        # create a temp column truncated to length i
        df_subset = df_subset.assign(
            Sector_tmp=df_subset['Sector'].apply(lambda x: x[0:i]))
        # keep rows whose temp sector is a 1:1 parent
        df_subset = df_subset.loc[df_subset['Sector_tmp'].isin(sector_list)]
        # drop all rows with duplicate temp values, as a less aggregated
        # naics already exists (build the subset columns locally rather than
        # mutating the group_cols argument across loop iterations)
        gc = [e for e in group_cols if e != 'Sector'] + ['Sector_tmp']
        df_subset2 = df_subset.drop_duplicates(
            subset=gc, keep=False).reset_index(drop=True)

        # merge the naics crosswalk
        new_naics = pd.merge(df_subset2, cw[[sector_merge, sector_add]],
                             how='left', left_on=['Sector_tmp'],
                             right_on=[sector_merge])
        # count the number of child naics associated with each parent
        new_naics = new_naics.assign(sector_count=new_naics.groupby(
            ['Location', 'Sector_tmp'])['Sector_tmp'].transform('count'))
        # only keep the rows where the count is 1
        new_naics2 = new_naics[new_naics['sector_count'] == 1].drop(
            columns=['sector_count'])
        # warn when rows with more than one child naics are dropped; these
        # will need a method of estimation
        missing_naics = new_naics[new_naics['sector_count'] > 1]
        if len(missing_naics) > 0:
            missing_naics = missing_naics[['Location',
                                           'Sector']].values.tolist()
            log.warning('There is data at sector length ' + str(i) +
                        ' that is lost at sector length ' + str(i + 1) +
                        ' for ' + str(missing_naics))
        # swap the new child code in as the Sector column
        new_naics2 = new_naics2.drop(
            columns=[sector_merge, 'Sector', 'Sector_tmp'])
        new_naics2 = new_naics2.rename(columns={sector_add: 'Sector'})

        # append new naics rows to the df
        if len(new_naics2) > 0:
            fbs = pd.concat([fbs, new_naics2], sort=True)

    return fbs
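
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal, hypothetical example of calling sector_disaggregation_generalized
# on a toy FBS dataframe. The sector codes, locations, and flow amounts are
# invented for illustration; real inputs come from a flowsa FlowBySector df.
def _example_sector_disaggregation_generalized():
    fbs = pd.DataFrame({
        'Sector': ['1111', '11121'],
        'Location': ['00000', '00000'],
        'FlowAmount': [10.0, 5.0],
    })
    group_cols = ['Sector', 'Location']
    # rows for any single-child NAICS 5/6 descendants of the codes above
    # are appended to the returned df
    return sector_disaggregation_generalized(fbs, group_cols)
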
def naics_expansion(facility_NAICS):
    """
    Modeled after sector_disaggregation in flowbyfunctions; updates NAICS
    codes to more granular sectors if there is only one naics at the lower
    level.
    :param facility_NAICS: df of facilities from facility matcher with NAICS
    :return: df with NAICS 4 and NAICS 5 codes expanded to their single
             child codes
    """
    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns, keeping only 1:1 parent:child rows
    cw4 = cw_load[['NAICS_4', 'NAICS_5']]
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns, keeping only 1:1 parent:child rows
    cw5 = cw_load[['NAICS_5', 'NAICS_6']]
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through naics 4 and naics 5 codes, appending the more granular
    # naics level to the df at each step
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to NAICS with length == i
        df_subset = facility_NAICS.loc[
            facility_NAICS["NAICS"].apply(lambda x: len(x) == i)]
        # keep rows whose NAICS is a 1:1 parent
        df_subset = df_subset.loc[df_subset['NAICS'].isin(sector_list)]

        # merge the naics crosswalk; drop duplicate parent:child pairs so
        # the merge does not fan out over deeper naics levels present in cw
        new_naics = pd.merge(
            df_subset, cw[[sector_merge, sector_add]].drop_duplicates(),
            how='left', left_on=['NAICS'], right_on=[sector_merge])

        # replace NAICS with the child code, then drop the merge columns
        new_naics['NAICS'] = new_naics[sector_add]
        new_naics = new_naics.drop(columns=[sector_merge, sector_add])

        # drop records whose NAICS have now been expanded
        facility_NAICS = facility_NAICS[
            ~facility_NAICS['NAICS'].isin(sector_list)]
        # append new naics rows to the df
        facility_NAICS = pd.concat([facility_NAICS, new_naics], sort=True)

    return facility_NAICS
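
# --- Illustrative usage sketch (not part of the original module) ---
# A hypothetical facility dataframe; 'FacilityID' and the NAICS codes are
# invented for illustration. Any code with a single child in the crosswalk
# is replaced by that child code.
def _example_naics_expansion():
    facility_NAICS = pd.DataFrame({
        'FacilityID': ['A1', 'B2'],
        'NAICS': ['1111', '31111'],
    })
    return naics_expansion(facility_NAICS)
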
def sector_disaggregation(sector_disaggregation, groupby_dict):
    """
    Disaggregate sectors if there is only one naics at the lower level.
    Works for NAICS 4 and lower.
    :param sector_disaggregation: a FBS df
    :param groupby_dict: dict of FBS fields, passed to clean_df
    :return: a FBS df with missing naics5 and naics6 rows appended
    """
    # todo: modify by adding a count column - only want to keep the 1:1
    #  parent:child relationship

    sector_disaggregation = clean_df(sector_disaggregation, groupby_dict,
                                     fbs_fill_na_dict)
    # ensure None values are not strings
    sector_disaggregation['SectorConsumedBy'] = \
        sector_disaggregation['SectorConsumedBy'].replace({'None': ""})
    sector_disaggregation['SectorProducedBy'] = \
        sector_disaggregation['SectorProducedBy'].replace({'None': ""})

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()

    # loop from the shortest sector length in the df up to 6 digits,
    # appending the missing naics levels to the df
    length = sector_disaggregation[[
        fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).min().min()
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the crosswalk by naics length, keeping only the rows where
        # there is a single sector_add value for a sector_merge value
        cw = cw_load[[sector_merge, sector_add]]
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length i or i + 1
        df_subset = sector_disaggregation.loc[
            sector_disaggregation[fbs_activity_fields[0]].apply(
                lambda x: i + 1 >= len(x) >= i) |
            sector_disaggregation[fbs_activity_fields[1]].apply(
                lambda x: i + 1 >= len(x) >= i)]
        # create temp columns truncated to length i
        df_subset = df_subset.assign(
            SectorProduced_tmp=df_subset[fbs_activity_fields[0]].apply(
                lambda x: x[0:i]))
        df_subset = df_subset.assign(
            SectorConsumed_tmp=df_subset[fbs_activity_fields[1]].apply(
                lambda x: x[0:i]))
        # subset the df to rows where the tmp sector columns are in the
        # naics list
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[
            (df_subset['SectorProduced_tmp'] == "") &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat the subsets
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3],
                              sort=False)
        # drop all rows with duplicate temp values, as a less aggregated
        # naics already exists
        df_subset = df_subset.drop_duplicates(
            subset=['Flowable', 'Context', 'Location', 'SectorProduced_tmp',
                    'SectorConsumed_tmp'],
            keep=False).reset_index(drop=True)

        # merge the naics crosswalk on the produced, then the consumed,
        # sector column
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop old sector columns and rename the new ones
        new_naics = new_naics.drop(
            columns=["SectorProducedBy", "SectorConsumedBy",
                     "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})
        # append new naics rows to the df
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({'nan': ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({'nan': ""})
        sector_disaggregation = pd.concat([sector_disaggregation, new_naics],
                                          sort=True)

    # replace blank strings and NaN with None
    sector_disaggregation = sector_disaggregation.replace({'': None})
    sector_disaggregation = sector_disaggregation.replace({np.nan: None})

    return sector_disaggregation
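
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of calling sector_disaggregation on a toy FBS df,
# assuming flow_by_sector_fields is an appropriate field dict for clean_df
# and that clean_df fills in any missing FBS columns. The sector codes,
# flows, and locations are invented.
def _example_sector_disaggregation():
    fbs = pd.DataFrame({
        'SectorProducedBy': ['2213', '22132'],
        'SectorConsumedBy': ['1111', '31111'],
        'Flowable': ['Water', 'Water'],
        'Context': ['resource', 'resource'],
        'Location': ['00000', '00000'],
        'FlowAmount': [10.0, 5.0],
    })
    return sector_disaggregation(fbs, flow_by_sector_fields)
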
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data.
    Allocates flow amount equally across child NAICS when a parent NAICS is
    not at the target_sector_level.
    :param df: a FBS df
    :param target_sector_level: str, e.g. 'NAICS_6'
    :return: a FBS df with all sectors at the target sector level
    """
    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, \
        replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current and target sector lengths
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts of child sectors per parent
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced column
        df_x = pd.merge(df_subset, cw, how='left',
                        left_on=[activity_field], right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts based on sector count, allocating
        # equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to rows_lost (DataFrame.append is deprecated, so concat)
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS to be allocated: '
                        '{}'.format(' '.join(map(str, sector_list))))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(drop=True, inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
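
# --- Illustrative usage sketch (not part of the original module) ---
# Allocate a toy 4-digit flow equally across its 6-digit children, assuming
# flowsa's sector_level_key maps 'NAICS_2'..'NAICS_6' to 2..6. The code and
# amount are invented.
def _example_check_for_missing_sector_data():
    df = pd.DataFrame({
        'SectorProducedBy': ['1111'],
        'FlowAmount': [100.0],
    })
    # each 6-digit child of '1111' receives an equal share of 100.0
    return check_for_missing_sector_data(df, 'NAICS_6')
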
def check_if_losing_sector_data(df, df_subset, target_sector_level):
    """
    Determine the rows of data that would be lost if the data were subset to
    the target sector level. In some instances, not all data is
    disaggregated to the target level; those rows are reallocated equally
    across their child sectors at the target level.
    :param df: a FBS df with all sector lengths
    :param df_subset: the FBS df subset to the target sector level
    :param target_sector_level: str, e.g. 'NAICS_6'
    :return: df_subset with the reallocated rows appended
    """
    df = df.fillna(fbs_fill_na_dict)
    # exclude nonsectors
    df = df.replace({'nan': '', 'None': ''})

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[
            (df[fbs_activity_fields[0]] == '') &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True,
                         sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) |
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) &
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df_y that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[['Class', 'Context', 'FlowType', 'Flowable',
                              'Location', 'LocationSystem', 'Unit', 'Year',
                              'spb_tmp', 'scb_tmp']],
                        how='left',
                        left_on=['Class', 'Context', 'FlowType', 'Flowable',
                                 'Location', 'LocationSystem', 'Unit',
                                 'Year', 'SectorProducedBy',
                                 'SectorConsumedBy'],
                        right_on=['Class', 'Context', 'FlowType', 'Flowable',
                                  'Location', 'LocationSystem', 'Unit',
                                  'Year', 'spb_tmp', 'scb_tmp'])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) & (df_m['spb_tmp'].isnull())]
        # clean df
        rl = clean_df(rl, flow_by_sector_fields, fbs_fill_na_dict)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors:
        # import cw and subset to current and target sector lengths
        cw_load = load_sector_length_crosswalk_w_nonnaics()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts of child sectors per parent
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl, cw, how='left',
                        left_on=[fbs_activity_fields[0]], right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])
        rl_m2 = pd.merge(rl_m, cw, how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # combine the two sector count columns into one
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts based on sector count, allocating
        # equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to rows_lost (DataFrame.append is deprecated, so concat)
        if len(rl) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS not represented in current '
                        'data subset: {}'.format(' '.join(map(str,
                                                              rl_list))))
            rows_lost = pd.concat([rows_lost, rl_m3],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from subsetting the dataframe by specified '
                 'sector length')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level +
                 ' associated with the sectors previously being dropped')

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df_subset, rows_lost], ignore_index=True,
                               sort=True)
    df_w_lost_data = df_w_lost_data.replace({'': None})

    return df_w_lost_data
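
# --- Illustrative usage sketch (not part of the original module) ---
# df holds mixed sector lengths; df_subset holds only target-length rows.
# Rows not disaggregated to the target level are reallocated equally across
# their 6-digit children and appended to the subset. All column values
# below are invented for illustration.
def _example_check_if_losing_sector_data():
    common = {'Class': 'Water', 'Context': 'resource',
              'FlowType': 'ELEMENTARY_FLOW', 'Flowable': 'Water',
              'Location': '00000', 'LocationSystem': 'FIPS', 'Unit': 'kg',
              'Year': 2015}
    df = pd.DataFrame([{**common, 'SectorProducedBy': '1111',
                        'SectorConsumedBy': '', 'FlowAmount': 100.0}])
    df_subset = pd.DataFrame(columns=df.columns)
    return check_if_losing_sector_data(df, df_subset, 'NAICS_6')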