def write_naics_2012_crosswalk():
    """
    Create a NAICS 2 - 6 digit crosswalk
    :return:
    """

    # load the useeior mastercrosswalk subset to the naics timeseries
    cw_load = load_crosswalk('sector_timeseries')

    # load BEA codes that will act as NAICS
    house = load_crosswalk('household')
    govt = load_crosswalk('government')
    bea = pd.concat([house, govt], ignore_index=True).rename(
        columns={'Code': 'NAICS_2012_Code',
                 'NAICS_Level_to_Use_For': 'secLength'})
    bea = bea[['NAICS_2012_Code', 'secLength']]

    # extract naics 2012 code column and drop duplicates and empty cells
    cw = cw_load[['NAICS_2012_Code']].drop_duplicates()
    cw = replace_NoneType_with_empty_cells(cw)
    cw = cw[cw['NAICS_2012_Code'] != '']
    # also drop the existing household and government codes because not all
    # inclusive and does not conform to NAICS length standards
    cw = cw[~cw['NAICS_2012_Code'].str.startswith(
        tuple(['F0', 'S0']))].reset_index(drop=True)

    # add column of sector length
    cw['secLength'] = cw['NAICS_2012_Code'].apply(
        lambda x: f"NAICS_{str(len(x))}")
    # add bea codes subbing for NAICS
    cw2 = pd.concat([cw, bea], ignore_index=True)

    # create dictionary of dataframes
    d = dict(tuple(cw2.groupby('secLength')))

    for l in range(2, 9):
        d[f'NAICS_{l}'] = d[f'NAICS_{l}'][['NAICS_2012_Code']].reset_index(
            drop=True).rename(columns={'NAICS_2012_Code': f'NAICS_{l}'})

    naics_cw = d['NAICS_2']
    for l in range(3, 7):
        naics_cw = (d[f'NAICS_{l}'].assign(
            temp=d[f'NAICS_{l}'][f'NAICS_{l}'].str.extract(
                pat=f"({'|'.join(naics_cw[f'NAICS_{l-1}'])})")).merge(
            naics_cw, how='right', left_on='temp',
            right_on=f'NAICS_{l-1}',
            suffixes=['', '_y'])).drop(columns=['temp'])

    # reorder
    naics_cw = naics_cw.reindex(sorted(naics_cw.columns), axis=1)

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_2012_Crosswalk.csv", index=False)
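
# The dictionary-of-dataframes step above can be unfamiliar: wrapping a
# groupby in dict(tuple(...)) yields one sub-frame per sector length.
# A minimal, self-contained sketch with toy codes (not flowsa's real
# crosswalk data):
def _example_groupby_to_dict():
    import pandas as pd
    cw2 = pd.DataFrame({'NAICS_2012_Code': ['11', '111', '1111'],
                        'secLength': ['NAICS_2', 'NAICS_3', 'NAICS_4']})
    d = dict(tuple(cw2.groupby('secLength')))
    # d['NAICS_3'] is a DataFrame holding only the 3-digit rows
    return d['NAICS_3']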
def addSectorNames(df):
    """
    Add column to an FBS df with the sector names
    :param df: FBS df with singular "Sector" column
    :return: FBS df with new column of combined Sector and SectorNames
    """
    # load crosswalk and add names
    cw = load_crosswalk('sector_name')
    cw['SectorName'] = cw['NAICS_2012_Code'].map(
        str) + ' (' + cw['NAICS_2012_Name'] + ')'
    cw = cw.rename(columns={'NAICS_2012_Code': 'Sector'})
    df = df.merge(cw[['Sector', 'SectorName']], how='left')
    df = df.reset_index(drop=True)

    return df
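
# Hypothetical usage sketch for addSectorNames(), assuming the flowsa
# 'sector_name' crosswalk is available locally. The toy 'Sector' values
# are for illustration only.
def _example_add_sector_names():
    import pandas as pd
    fbs = pd.DataFrame({'Sector': ['111', '3121'],
                        'FlowAmount': [1.0, 2.0]})
    # returns the same rows with a new 'SectorName' column,
    # e.g. '111 (Crop Production)'
    return addSectorNames(fbs)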
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in flowbyactivity that appear to be like sectors
    are actually sectors
    :param fba: a flow by activity dataset
    :return: A list of activities not matching the default sector list or
        text indicating 100% match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])

    # Get list of module default sectors
    flowsa_sector_list = list(
        load_crosswalk('sector_timeseries')[SECTOR_SOURCE_NAME])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        vLog.debug("%s activities not matching sectors in default %s list",
                   str(len(activities_missing_sectors)), SECTOR_SOURCE_NAME)
        return activities_missing_sectors
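
# The core check above is just a set difference. A minimal, dependency-free
# sketch of the same pattern (toy values, not real NAICS):
def _example_set_difference_check():
    activities = ['111', '112', 'not_a_sector']
    sector_list = ['111', '112']
    missing = set(activities) - set(sector_list)
    return missing  # {'not_a_sector'}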
def assign_naics(df_load):
    """
    Function to assign NAICS codes to each dataframe activity
    :param df_load: df, a FlowByActivity subset that contains unique
        activity names
    :return: df with assigned Sector columns
    """
    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # keep only rows mapped to 6-digit NAICS (drops NAICS > 6 digits)
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 6)].reset_index(drop=True)

    df = pd.merge(df_load, cw, left_on='Activity',
                  right_on='BEA_2012_Detail_Code')
    df = df.drop(columns=["BEA_2012_Detail_Code"])
    df = df.rename(columns={"NAICS_2012_Code": "Sector"})
    df['SectorSourceName'] = 'NAICS_2012_Code'

    return df
def melt_naics_crosswalk():
    """
    Create a melt version of the naics 07 to 17 crosswalk to map
    naics to naics 2012
    :return: df, naics crosswalk melted
    """
    # load the mastercrosswalk and subset by sectorsourcename,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')

    # create melt table of possible 2007 and 2017 naics that can
    # be mapped to 2012
    cw_melt = cw_load.melt(id_vars='NAICS_2012_Code', var_name='NAICS_year',
                           value_name='NAICS')
    # drop the naics year because not relevant for replacement purposes
    cw_replacement = cw_melt.dropna(how='any')
    cw_replacement = cw_replacement[
        ['NAICS_2012_Code', 'NAICS']].drop_duplicates()
    # drop rows where contents are equal
    cw_replacement = cw_replacement[
        cw_replacement['NAICS_2012_Code'] != cw_replacement['NAICS']]
    # drop rows where length > 6
    cw_replacement = cw_replacement[cw_replacement['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)
    # order by naics 2012
    cw_replacement = cw_replacement.sort_values(
        ['NAICS', 'NAICS_2012_Code']).reset_index(drop=True)

    # create allocation ratios by determining number of
    # NAICS 2012 to other naics when not a 1:1 ratio
    cw_replacement_2 = cw_replacement.assign(
        naics_count=cw_replacement.groupby(
            ['NAICS'])['NAICS_2012_Code'].transform('count'))
    cw_replacement_2 = cw_replacement_2.assign(
        allocation_ratio=1 / cw_replacement_2['naics_count'])

    return cw_replacement_2
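
# A self-contained sketch of the melt-and-ratio pattern used above, on toy
# data: one NAICS 2007 code maps to two NAICS 2012 codes, so each mapping
# gets an allocation_ratio of 0.5. Codes are illustrative only.
def _example_melt_allocation_ratio():
    import pandas as pd
    cw = pd.DataFrame({'NAICS_2012_Code': ['236115', '236116'],
                       'NAICS_2007_Code': ['236110', '236110']})
    melt = cw.melt(id_vars='NAICS_2012_Code', var_name='NAICS_year',
                   value_name='NAICS')
    melt = melt.assign(naics_count=melt.groupby(
        'NAICS')['NAICS_2012_Code'].transform('count'))
    melt = melt.assign(allocation_ratio=1 / melt['naics_count'])
    return melt  # allocation_ratio == 0.5 for both rows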
def allocate_usda_ers_mlu_other_land(df, attr, fbs_list):
    """
    From the USDA ERS MLU 2012 report:
    "Includes miscellaneous other uses, such as industrial and commercial
    sites in rural areas, cemeteries, golf courses, mining areas, quarry
    sites, marshes, swamps, sand dunes, bare rocks, deserts, tundra, rural
    residential, and other unclassified land. In this report, urban land is
    reported as a separate category."

    Mining data is calculated using a separate source = BLM PLS.

    Want to extract rural residential land area from total value
    of 'Other Land'
    :param df: df, USDA ERS MLU Land
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, FBS dfs for activities created prior to the
        activity set that calls on this fxn
    :return: df, allocated USDA ERS MLU Land, FBS format
    """
    # land in rural residential lots
    rural_res = get_area_of_rural_land_occupied_by_houses_2013()

    # household codes
    household = load_crosswalk('household')
    household = household['Code'].drop_duplicates().tolist()

    # in df, where sector is a personal expenditure value, and
    # location = 00000, replace with rural res value
    vLogDetailed.info('The only category for MLU other land use is rural land '
                      'occupation. All other land area in this category is '
                      'unassigned to sectors, resulting in unaccounted land '
                      'area.')
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])

    return df
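
# The conditional overwrite above is plain np.where. A minimal sketch with
# toy sector codes and areas (illustrative only):
def _example_conditional_replace():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'SectorConsumedBy': ['F010', '111'],
                       'FlowAmount': [100.0, 50.0]})
    household = ['F010']
    rural_res = 7.5
    df['FlowAmount'] = np.where(df['SectorConsumedBy'].isin(household),
                                rural_res, df['FlowAmount'])
    return df  # the 'F010' row becomes 7.5; the '111' row keeps 50.0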
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the mastercrosswalk and subset by sectorsourcename,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug('Checking if sectors represent a different '
                   'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where(
                (df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename], 'FlowAmount'] = \
                df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [e for e in df.columns.values.tolist()
                        if e not in possible_column_headers]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull()) &
                     (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
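
# The replacement step above combines a merge with np.where and an
# allocation ratio. A self-contained sketch of just that step, on toy data
# where one non-2012 code maps to two 2012 codes and the flow is split:
def _example_year_replacement_step():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'Sector': ['236110'], 'FlowAmount': [10.0]})
    cw_melt = pd.DataFrame({'NAICS': ['236110', '236110'],
                            'NAICS_2012_Code': ['236115', '236116'],
                            'allocation_ratio': [0.5, 0.5]})
    df = df.merge(cw_melt, left_on='Sector', right_on='NAICS', how='left')
    df['Sector'] = np.where(df['Sector'] == df['NAICS'],
                            df['NAICS_2012_Code'], df['Sector'])
    df['FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
    # two rows remain: 236115 and 236116, each with FlowAmount 5.0
    return df.drop(columns=['NAICS', 'NAICS_2012_Code', 'allocation_ratio'])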
# write_Crosswalk_BEA_2012_Detail.py (scripts)
#!/usr/bin/env python3
# coding=utf-8
"""
Create a crosswalk linking BEA to NAICS for 2012 Detail
"""

from flowsa.common import load_crosswalk
from flowsa.settings import datapath

if __name__ == '__main__':
    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # keep only rows mapped to 6-digit NAICS (drops NAICS > 6 digits)
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 6)].reset_index(drop=True)

    df = cw.rename(columns={"NAICS_2012_Code": "Sector",
                            "BEA_2012_Detail_Code": "Activity"})
    df['SectorSourceName'] = 'NAICS_2012_Code'
    df['ActivitySourceName'] = 'BEA_2012_Detail_Code'
    df.dropna(subset=["Sector"], inplace=True)
    # assign sector type
    df['SectorType'] = None
    # sort df
    df = df.sort_values('Sector')
    # reset index
    df.reset_index(drop=True, inplace=True)
    # set order
    df = df[['ActivitySourceName', 'Activity',
             'SectorSourceName', 'Sector', 'SectorType']]
    # save as csv (the original script is truncated here; the output
    # path and filename below are assumed from the naming convention the
    # other crosswalk scripts follow)
    df.to_csv(datapath + "activitytosectormapping/" +
              "NAICS_Crosswalk_BEA_2012_Detail.csv", index=False)
def update_naics_crosswalk():
    """
    update the useeior crosswalk with crosswalks created for
    flowsa datasets - want to add any NAICS > 6 digits

    Add NAICS 2002
    :return: df of NAICS that include any unofficial NAICS
    """
    # read useeior master crosswalk, subset NAICS columns
    naics_load = load_crosswalk('BEA')
    naics = naics_load[['NAICS_2007_Code', 'NAICS_2012_Code',
                        'NAICS_2017_Code']
                       ].drop_duplicates().reset_index(drop=True)
    # convert all rows to string
    naics = naics.astype(str)
    # ensure all None are NoneType
    naics = replace_strings_with_NoneType(naics)
    # drop rows where all None
    naics = naics.dropna(how='all')

    # drop naics > 6 in mastercrosswalk (all manufacturing) because unused
    # and slows functions
    naics = naics[naics['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)

    # find any NAICS where length > 6 that are used for allocation purposes
    # and add to naics list
    missing_naics_df_list = []
    # read in all the crosswalk csv files
    for file_name in glob.glob(datapath + "activitytosectormapping/" +
                               "NAICS_Crosswalk_*.csv"):
        # skip Statistics Canada GDP because not all sectors relevant
        if file_name != crosswalkpath + 'Crosswalk_StatCan_GDP_toNAICS.csv':
            df = pd.read_csv(file_name, low_memory=False, dtype=str)
            # convert all rows to string
            df = df.astype(str)
            # determine sector year
            naics_year = df['SectorSourceName'][0]
            if naics_year == 'nan':
                log.info(f'Missing SectorSourceName for {file_name}')
                continue
            # subset dataframe so only sector
            df = df[['Sector']]
            # trim whitespace and cast as string, rename column
            df['Sector'] = df['Sector'].astype(str).str.strip()
            df = df.rename(columns={'Sector': naics_year})
            # extract sector year column from master crosswalk
            df_naics = naics[[naics_year]]
            # find any NAICS that are in source crosswalk but not in
            # mastercrosswalk
            common = df.merge(df_naics, on=naics_year)
            missing_naics = df[~df[naics_year].isin(common[naics_year])]
            # extract sectors where len > 6 and that do not include a '-'
            missing_naics = missing_naics[missing_naics[naics_year].apply(
                lambda x: len(x) > 6)]
            if len(missing_naics) != 0:
                missing_naics = missing_naics[
                    ~missing_naics[naics_year].str.contains('-')]
                # append to df list
                missing_naics_df_list.append(missing_naics)
    # concat df list and drop duplications
    missing_naics_df = \
        pd.concat(missing_naics_df_list, ignore_index=True,
                  sort=False).drop_duplicates().reset_index(drop=True)
    # sort df
    missing_naics_df = missing_naics_df.sort_values(['NAICS_2012_Code'])
    missing_naics_df = missing_naics_df.reset_index(drop=True)

    # add missing naics to master naics crosswalk
    total_naics = pd.concat([naics, missing_naics_df], ignore_index=True)

    # sort df
    total_naics = total_naics.sort_values(
        ['NAICS_2012_Code', 'NAICS_2007_Code']).drop_duplicates()
    total_naics = total_naics[~total_naics['NAICS_2012_Code'].isin(
        ['None', 'unknown', 'nan',
         'Unknown', np.nan])].reset_index(drop=True)

    # convert all columns to string
    total_naics = total_naics.astype(str)

    # add naics 2002
    naics_02 = load_naics_02_to_07_crosswalk()
    naics_cw = pd.merge(total_naics, naics_02, how='left')

    # ensure NoneType
    naics_cw = replace_strings_with_NoneType(naics_cw)

    # reorder
    naics_cw = naics_cw[['NAICS_2002_Code', 'NAICS_2007_Code',
                         'NAICS_2012_Code', 'NAICS_2017_Code']]

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_Crosswalk_TimeSeries.csv", index=False)
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data
    """

    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset, cw, how='left',
                        left_on=[activity_field], right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
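
# The equal-allocation step above divides a parent's FlowAmount by the
# number of target-level children it maps to. Self-contained sketch with
# toy codes (one 3-digit parent, three 6-digit children):
def _example_equal_allocation():
    import pandas as pd
    parent = pd.DataFrame({'Sector': ['111'], 'FlowAmount': [9.0]})
    cw = pd.DataFrame({'NAICS_3': ['111', '111', '111'],
                       'NAICS_6': ['111110', '111120', '111130']})
    cw['sector_count'] = cw.groupby('NAICS_3')['NAICS_3'].transform('count')
    out = parent.merge(cw, left_on='Sector', right_on='NAICS_3')
    out['Sector'] = out['NAICS_6']
    out['FlowAmount'] = out['FlowAmount'] / out['sector_count']
    # three rows with FlowAmount 3.0 each
    return out.drop(columns=['NAICS_3', 'NAICS_6', 'sector_count'])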
def naics_expansion(facility_NAICS):
    """
    modeled after sector_disaggregation in flowbyfunctions, updates NAICS
    to more granular sectors if there is only one naics at a lower level
    :param facility_NAICS: df of facilities from facility matcher with NAICS
    :return: df
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns
    cw4 = cw_load[['NAICS_4', 'NAICS_5']]
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']]
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS 4 and NAICS 5, expanding codes to the next level
    # when there is only one child NAICS
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to NAICS with length = i
        df_subset = facility_NAICS.loc[facility_NAICS["NAICS"].apply(
            lambda x: len(x) == i)]
        # subset the df to the rows where the tmp sector columns are
        # in naics list
        df_subset = df_subset.loc[df_subset['NAICS'].isin(sector_list)]

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['NAICS'],
                             right_on=[sector_merge])
        # drop columns and rename new sector columns
        new_naics['NAICS'] = new_naics[sector_add]
        new_naics = new_naics.drop(columns=[sector_merge, sector_add])

        # drop records with NAICS that have now been expanded
        facility_NAICS = facility_NAICS[
            ~facility_NAICS['NAICS'].isin(sector_list)]

        # append new naics to df
        facility_NAICS = pd.concat([facility_NAICS, new_naics], sort=True)

    return facility_NAICS
def assign_naics(df_load):
    """
    Function to assign NAICS codes to each dataframe activity
    :param df_load: df, a FlowByActivity subset that contains unique
        activity names
    :return: df with assigned Sector columns
    """
    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # subset to the 6-digit NAICS mapped to each BEA detail code
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 6)].reset_index(drop=True)
    cw = cw.sort_values(['BEA_2012_Detail_Code', 'NAICS_2012_Code'])

    df = pd.merge(df_load, cw, left_on='Activity',
                  right_on='BEA_2012_Detail_Code', how='left')
    df = df.drop(columns=["BEA_2012_Detail_Code"])
    df = df.rename(columns={"NAICS_2012_Code": "Sector"})

    # reset sector value for sand, gravel, clay
    df.loc[df['Activity'] == '212320', 'Sector'] = '212321'
    df.loc[df['Activity'] == '212390', 'Sector'] = '212391'
    # add the remaining 6-digit NAICS for those two activities
    df = pd.concat([df, pd.DataFrame(
        [['Blackhurst_IO', '212320', '212322'],
         ['Blackhurst_IO', '212320', '212324'],
         ['Blackhurst_IO', '212320', '212325'],
         ['Blackhurst_IO', '212390', '212392'],
         ['Blackhurst_IO', '212390', '212393'],
         ['Blackhurst_IO', '212390', '212399']],
        columns=['ActivitySourceName', 'Activity', 'Sector'])],
        ignore_index=True, sort=True)

    # drop two rows where Blackhurst's IO vectors do not align with the
    # NAICS to BEA mapping because a NAICS code is its own activity rather
    # than a subset of an activity
    # 'iron ore mining' is its own row
    df = df[~((df['Activity'] == '2122A0') & (df['Sector'] == '212210'))]
    # 'support activities for oil and gas operations' is its own row
    df = df[~((df['Activity'] == '21311A') & (df['Sector'] == '213112'))]
    # ensure the sector column has value for iron ore and support activities
    df.loc[df['Activity'] == '212210', 'Sector'] = '212210'
    df.loc[df['Activity'] == '213112', 'Sector'] = '213112'

    df['SectorSourceName'] = 'NAICS_2012_Code'

    return df
def equally_allocate_parent_to_child_naics(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at
    target sector level.
    Equally allocate parent NAICS to child NAICS where child NAICS missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """
    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
                   & (df[fbs_activity_fields[1]] == '')]
        df_x2 = \
            df.loc[(df[fbs_activity_fields[0]] == '') &
                   (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = \
            df.loc[(df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
                   & (df[fbs_activity_fields[1]].apply(
                       lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True,
                         sort=False)

        if len(df_x) > 0:
            # create df of i + 1 length
            df_y1 = df.loc[
                df[fbs_activity_fields[0]].apply(
                    lambda x: len(x) == i + 1) |
                df[fbs_activity_fields[1]].apply(
                    lambda x: len(x) == i + 1)]
            df_y2 = df.loc[
                df[fbs_activity_fields[0]].apply(
                    lambda x: len(x) == i + 1) &
                df[fbs_activity_fields[1]].apply(
                    lambda x: len(x) == i + 1)]
            df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

            # create temp sector columns in df y, that are i digits in length
            df_y.loc[:, 'spb_tmp'] = \
                df_y[fbs_activity_fields[0]].apply(lambda x: x[0:i])
            df_y.loc[:, 'scb_tmp'] = \
                df_y[fbs_activity_fields[1]].apply(lambda x: x[0:i])
            # don't modify household sector lengths or gov't transport
            df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

            # merge the two dfs
            if 'Context' in df_y.columns:
                merge_cols = ['Class', 'Context', 'FlowType', 'Flowable',
                              'Location', 'LocationSystem', 'Unit', 'Year']
            else:
                merge_cols = ['Class', 'FlowType', 'Location',
                              'LocationSystem', 'Unit', 'Year']
            df_m = pd.merge(
                df_x, df_y[merge_cols + ['spb_tmp', 'scb_tmp']],
                how='left',
                left_on=merge_cols + ['SectorProducedBy',
                                      'SectorConsumedBy'],
                right_on=merge_cols + ['spb_tmp', 'scb_tmp'])

            # extract the rows that are not disaggregated to more
            # specific naics
            rl = df_m[(df_m['scb_tmp'].isnull()) &
                      (df_m['spb_tmp'].isnull())].reset_index(drop=True)
            # clean df
            rl = replace_strings_with_NoneType(rl)
            rl_list = rl[['SectorProducedBy', 'SectorConsumedBy']]\
                .drop_duplicates().values.tolist()

            # match sectors with target sector length sectors
            # import cw and subset to current sector length
            # and target sector length
            cw_load = load_crosswalk('sector_length')
            nlength = list(sector_level_key.keys())[list(
                sector_level_key.values()).index(i)]
            cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
            # add column with counts
            cw['sector_count'] = \
                cw.groupby(nlength)[nlength].transform('count')

            # merge df & conditionally replace sector produced/consumed
            # columns
            rl_m = pd.merge(rl, cw, how='left',
                            left_on=[fbs_activity_fields[0]],
                            right_on=[nlength])
            rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                     fbs_activity_fields[0]] = rl_m[target_sector_level]
            rl_m = rl_m.drop(columns=[nlength, target_sector_level])

            rl_m2 = pd.merge(rl_m, cw, how='left',
                             left_on=[fbs_activity_fields[1]],
                             right_on=[nlength])
            rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                      fbs_activity_fields[1]] = rl_m2[target_sector_level]
            rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

            # create one sector count column
            rl_m2['sector_count_x'] = \
                rl_m2['sector_count_x'].fillna(rl_m2['sector_count_y'])
            rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
            rl_m3 = rl_m3.drop(columns=['sector_count_y'])

            # calculate new flow amounts, based on sector count,
            # allocating equally to the new sector length codes
            rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
            rl_m3 = rl_m3.drop(columns=['sector_count', 'spb_tmp',
                                        'scb_tmp'])

            # append to df
            if len(rl) != 0:
                vLogDetailed.warning(
                    'Data found at %s digit NAICS not represented in '
                    'current data subset: %s',
                    str(i), ' '.join(map(str, rl_list)))
                rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info('Allocating FlowAmounts equally to '
                          'each %s associated with the sectors previously '
                          'dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
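
# The detection step above can be isolated: truncate the (i+1)-digit codes
# to i digits, left-merge the i-digit rows onto them, and any row whose
# merge keys come back null has no more-specific child in the data.
# Toy sketch (illustrative codes only):
def _example_detect_undisaggregated_parents():
    import pandas as pd
    parents = pd.DataFrame({'Sector': ['111', '112']})
    children = pd.DataFrame({'Sector': ['1111']})
    children['tmp'] = children['Sector'].str[:3]
    m = parents.merge(children[['tmp']], how='left',
                      left_on='Sector', right_on='tmp')
    # '112' comes back with a null 'tmp', so its flow must be
    # equally allocated to target-level children via the crosswalk
    return m[m['tmp'].isnull()]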
def equally_allocate_suppressed_parent_to_child_naics(
        df_load, sector_column, groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df,
    # add back in as copies of sector columns to keep
    if sector_like_activities:
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [e for e in groupcols
                     if e not in ['ActivityConsumedBy', 'ActivityProducedBy',
                                  'Description']]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(
        id_vars=["NAICS_6"], var_name="NAICS_Length",
        value_name="NAICS_Match").drop(
        columns=['NAICS_Length']).drop_duplicates()

    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup, cw_melt, how='left',
                         left_on=[sector_column], right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols], df, how='left', on=groupcols,
                   indicator=True).query(
        '_merge=="left_only"').drop('_merge', axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)

    # add length column and subset the data; the goal is to subtract
    # existing NAICS6 data from the total at the deepest sector length
    # with no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # for each flow/compartment/state, find the longest sector length
    # with no missing values (one digit above the shortest suppressed
    # sector length)
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment', 'Location'])['secLength'].agg(
        lambda x: x.min() - 1).reset_index(name='secLengthsup'))

    # merge the dfs to pair each NAICS6 row with the last sector length
    # holding complete data for its state; states without suppressed
    # data drop out of the merge
    df1 = df.merge(df_sup2)
    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [e for e in fba_default_grouping_fields
                if e not in ['ActivityConsumedBy', 'ActivityProducedBy']]
    sum_cols.append('mergeSec')
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={sector_column: 'NewNAICS',
                              'mergeSec': sector_column})
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] -
                              dfe['FlowAlloc']) / dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount', 'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])
    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are source-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)
    return dff
def sector_disaggregation(df_load):
    """
    function to disaggregate sectors if there is only one naics at the
    next, more specific level; expands codes from the minimum sector
    length present in the df up through naics 6
    :param df_load: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like, if aggregating
    # a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df,
    # add back in as copies of sector columns to keep
    if sector_like_activities:
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')

    # for loop min length to 6 digits, where min length cannot be
    # less than 2
    fields_list = []
    for i in range(2):
        if not (df[fbs_activity_fields[i]] == "").all():
            fields_list.append(fbs_activity_fields[i])
    length = df[fields_list].apply(lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value
        # in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(
                lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(
                lambda x: i + 1 >= len(x) >= i)]

        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=df_subset[
            fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=df_subset[
            fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns
        # are in naics list
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[
            (df_subset['SectorProduced_tmp'] == "") &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3],
                              sort=False)

        # drop all rows with duplicate temp values, as a less aggregated
        # naics exists
        # list of column headers that, if they exist in the df, are used
        # to identify those duplicates
        possible_column_headers = (
            'Flowable', 'FlowName', 'Unit', 'Context', 'Compartment',
            'Location', 'Year', 'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [e for e in possible_column_headers
                        if e in df_subset.columns.values.tolist()]

        df_subset = df_subset.drop_duplicates(
            subset=cols_to_drop, keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=[
            "SectorProducedBy", "SectorConsumedBy",
            "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})

        # append new naics to df
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True, ignore_index=True)

    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    # if activities are source-like, set col values
    # as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    return df
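
# The crosswalk filter above leans on drop_duplicates(keep=False): parents
# appearing more than once (several children) are removed entirely, so only
# parents with exactly one child remain. Toy sketch (fake codes):
def _example_single_child_parents():
    import pandas as pd
    cw = pd.DataFrame({'NAICS_5': ['22222', '22222', '33333'],
                       'NAICS_6': ['222221', '222222', '333331']})
    single = cw.drop_duplicates(subset=['NAICS_5'], keep=False)
    # only ('33333', '333331') remains: the sole parent with one child
    return single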
def convert_statcan_data_to_US_water_use(df, attr, download_FBA_if_missing):
    """
    Use Canadian GDP data to convert 3 digit canadian water use to
    US water use:
    - canadian gdp
    - us gdp
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for activity set
    :param download_FBA_if_missing: bool, True if would like to download
        missing FBAs from Data Commons, False if FBAs should be generated
        locally
    :return: df, FBA format, flowamounts converted
    """
    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(
        datasource='StatCan_GDP',
        year=attr['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=download_FBA_if_missing)

    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "USD"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge df
    df_m = pd.merge(df, gdp[['USD', 'ActivityProducedBy']], how='left',
                    left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['USD'] = df_m['USD'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    df_m = df_m[df_m['USD'] != 0]

    # convert to kg/USD
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / df_m['USD']
    df_m.loc[:, 'Unit'] = 'kg/USD'
    df_m = df_m.drop(columns=["USD"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(
        df_m, str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'],
        flowclass='Money',
        download_FBA_if_missing=download_FBA_if_missing)

    # load bea crosswalk
    cw_load = load_crosswalk('BEA')
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    # subset to BEA codes mapped to 3-digit NAICS
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load, cw, how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual US water use: (kg/USD) * US GDP (USD) = kg
    df_m2 = pd.merge(df_m, us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left', left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')
    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'kg'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2