def assign_naics(df):
    """Assign NAICS 2012 codes to BEA Make Table activities.

    Merges the BEA detail-code/NAICS crosswalk onto the activity df,
    keeping only 6-digit NAICS matches.

    :param df: df of unique activity names (column 'Activity')
    :return: df with 'Sector' and 'SectorSourceName' columns added
    """
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code',
                  'NAICS_2012_Code']].drop_duplicates().reset_index(drop=True)
    # drop all rows with naics >6
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 6)].reset_index(drop=True)
    df = pd.merge(df, cw, left_on='Activity', right_on='BEA_2012_Detail_Code')
    df = df.drop(columns=["BEA_2012_Detail_Code"])
    df = df.rename(columns={"NAICS_2012_Code": "Sector"})
    df['SectorSourceName'] = 'NAICS_2012_Code'
    return df


if __name__ == '__main__':
    # select years to pull unique activity names
    years = ['2002']
    # df of unique ers activity names
    df = unique_activity_names('BEA_Make_Table', years)
    # add manual naics 2012 assignments
    df = assign_naics(df)
    # drop any rows where naics12 is 'nan'
    # (because level of detail not needed or to prevent double counting)
    df.dropna(subset=["Sector"], inplace=True)
    # assign sector type
    df['SectorType'] = None
    # sort df
    df = order_crosswalk(df)
    # save as csv
    df.to_csv(datapath + "activitytosectormapping/" +
              "Crosswalk_BEA_Make_Table_toNAICS.csv", index=False)
# drop all rows with naics >6 cw = cw[cw['NAICS_2012_Code'].apply(lambda x: len(str(x)) == 6)].reset_index(drop=True) df = pd.merge(df, cw, left_on='Activity', right_on='BEA_2012_Detail_Code') df = df.drop(columns=["BEA_2012_Detail_Code"]) df = df.rename(columns={"NAICS_2012_Code": "Sector"}) df['SectorSourceName'] = 'NAICS_2012_Code' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2002'] # flowclass flowclass = ['Money'] # datasource datasource = 'BEA_Make_Table' # df of unique ers activity names df = unique_activity_names(flowclass, years, datasource) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_" + datasource + "_toNAICS.csv", index=False)
# write_Crosswalk_Census_CBP.py (scripts)
# !/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Create a crosswalk for Census CBP to NAICS 2012.
Downloaded data is already provided in NAICS
"""
import pandas as pd
from flowsa.common import datapath
from scripts.common_scripts import unique_activity_names, order_crosswalk

if __name__ == '__main__':
    # select years to pull unique activity names
    years = ['2012']
    # df of unique ers activity names
    df = unique_activity_names('Census_CBP', years)
    # Activity and Sector are the same
    df['Sector'] = df['Activity'].copy()
    # modify the sector for activity = '31-33': split the manufacturing
    # range across the three 2-digit NAICS it covers
    df.loc[df['Activity'] == '31-33', 'Sector'] = '31'
    # NOTE(review): DataFrame.append is removed in pandas 2.x — migrate to
    # pd.concat when the project's pandas floor allows
    df = df.append(pd.DataFrame(
        [['Census_CBP', '31-33', '32']],
        columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True)
    df = df.append(pd.DataFrame(
        [['Census_CBP', '31-33', '33']],
        columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True)
    # Add additional columns
    df['SectorSourceName'] = "NAICS_2012_Code"
    df['SectorType'] = "I"
cw = cw.sort_values(['BEA_2012_Detail_Code', 'NAICS_2012_Code']) df = pd.merge(df, cw, left_on='Activity', right_on='BEA_2012_Detail_Code') df = df.drop(columns=["BEA_2012_Detail_Code"]) df = df.rename(columns={"NAICS_2012_Code": "Sector"}) df['SectorSourceName'] = 'NAICS_2012_Code' return df if __name__ == '__main__': # select years to pull unique activity names year = '2002' # datasource datasource = 'Blackhurst_IO' # df of unique ers activity names df = unique_activity_names(datasource, year) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' # (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_" + datasource + "_toNAICS.csv", index=False)
# df.loc[df['Activity'] == # 'Summary: Pre-Reform Act Simultaneous Leases, Public Domain and Acquired Lands', # 'Sector'] = '' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2007', '2011', '2012'] # assign datasource datasource = 'BLM_PLS' # df of unique ers activity names df_list = [] for y in years: dfy = unique_activity_names(datasource, y) df_list.append(dfy) df = pd.concat(df_list, ignore_index=True).drop_duplicates() # add manual naics 2012 assignments df = assign_naics(df) # assign sector source name df['SectorSourceName'] = 'NAICS_2012_Code' # drop any rows where naics12 is 'nan' # (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = "I" # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_" +
# all other animal production: 11299 df.loc[df['Activity'] == 'Animals and products, Other animals and products', 'Sector'] = '112990E' # df.loc[df['Activity'] == 'Animals and products, All other animals and products', 'Sector'] = '112990E1' # df.loc[df['Activity'] == 'Animals and products, Milk pelts', 'Sector'] = '112990E1' df.loc[df['Activity'] == 'Wool', 'Sector'] = '112990F' ################# FORESTRY ##############33 df.loc[df['Activity'] == 'Forest products', 'Sector'] = '11531' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2012', '2017'] # df of unique ers activity names df = unique_activity_names('USDA_ERS_FIWS', years) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_USDA_ERS_FIWS_toNAICS.csv", index=False)
# write_Crosswalk_USDA_CoA_Cropland_NAICS.py (scripts) # !/usr/bin/env python3 # coding=utf-8 """ Create a crosswalk for CoA Cropland Naics to NAICS 2012. Downloaded data is already provided in NAICS """ import pandas as pd from flowsa.common import datapath from scripts.common_scripts import unique_activity_names, order_crosswalk if __name__ == '__main__': # select years to pull unique activity names years = ['2012', '2017'] # df of unique ers activity names df = unique_activity_names('USDA_CoA_Cropland_NAICS', years) # drop activities with symbol '&' df = df[~df['Activity'].str.contains('&')] # Activity and Sector are the same df['Sector'] = df['Activity'].copy() # modify the sector for activity ranges #df.loc[df['Activity'] == '11193 & 11194 & 11199', 'Sector'] = '11193' df = df.append(pd.DataFrame( [['USDA_CoA_Cropland_NAICS', '11193 & 11194 & 11199', '11193']], columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True) df = df.append(pd.DataFrame( [['USDA_CoA_Cropland_NAICS', '11193 & 11194 & 11199', '11194']], columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True) df = df.append(pd.DataFrame( [['USDA_CoA_Cropland_NAICS', '11193 & 11194 & 11199', '11199']],
df.loc[df['Activity'] == 'MINK, LIVE', 'Sector'] = '112930A' df.loc[df['Activity'] == 'RABBITS, LIVE', 'Sector'] = '112930B' # all other animal production: 11299 df.loc[df['Activity'] == 'ALPACAS', 'Sector'] = '112990A' df.loc[df['Activity'] == 'BISON', 'Sector'] = '112990B' df.loc[df['Activity'] == 'DEER', 'Sector'] = '112990C' df.loc[df['Activity'] == 'ELK', 'Sector'] = '112990D' df.loc[df['Activity'] == 'LLAMAS', 'Sector'] = '112990E' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2012', '2017'] # df of unique ers activity names df = unique_activity_names('USDA_CoA_Livestock', years) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_USDA_CoA_Livestock_toNAICS.csv", index=False)
cw = cw[cw['NAICS_2012_Code'].apply( lambda x: len(str(x)) == 6)].reset_index(drop=True) cw = cw.sort_values(['BEA_2012_Detail_Code', 'NAICS_2012_Code']) df = pd.merge(df, cw, left_on='Activity', right_on='BEA_2012_Detail_Code') df = df.drop(columns=["BEA_2012_Detail_Code"]) df = df.rename(columns={"NAICS_2012_Code": "Sector"}) df['SectorSourceName'] = 'NAICS_2012_Code' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2002'] # df of unique ers activity names df = unique_activity_names('Blackhurst_IO', years) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_Blackhurst_IO_toNAICS.csv", index=False)
act_list = unmapped_df['Activity'].values.tolist() # if in the activity list, the sector should be modified so last digit is 0 df['Sector'] = np.where(df['Activity'].isin(act_list), df['Activity'].apply(lambda x: x[0:5]) + '0', df['Sector']) return df if __name__ == '__main__': # select years to pull unique activity names years = ['2002', '2010', '2011', '2012', '2015'] # df of unique ers activity names df = unique_activity_names('BLS_QCEW', years) # Activity and Sector are the same df['Sector'] = df['Activity'].copy() # modify the sector for activity = '31-33' df.loc[df['Activity'] == '31-33', 'Sector'] = '31' df = df.append(pd.DataFrame( [['BLS_QCEW', '31-33', '32']], columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True) df = df.append(pd.DataFrame( [['BLS_QCEW', '31-33', '33']], columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True) df = link_non_bls_naics_to_naics(df) # Add additional columns df['SectorSourceName'] = "NAICS_2012_Code"
# df.loc[df['Activity'] == 'Total', 'Sector'] = '' # df.loc[df['Activity'] == 'Total Groundwater', 'Sector'] = '' # df.loc[df['Activity'] == 'Total Surface', 'Sector'] = '' # assign sector source name df['SectorSourceName'] = 'NAICS_2012_Code' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2010', '2015'] # df of unique ers activity names df = unique_activity_names('USGS_NWIS_WU', years) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = 'I' # assign sector type df['ActivitySourceName'] = 'USGS_NWIS_WU' # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_USGS_NWIS_WU_toNAICS.csv", index=False)
"""
Create a crosswalk linking Statistics Canada to NAICS
"""
import pandas as pd
from flowsa.common import datapath
from scripts.common_scripts import unique_activity_names, order_crosswalk

if __name__ == '__main__':
    # select years to pull unique activity names
    years = ['2011', '2015']
    # df of unique ers activity names
    df = unique_activity_names('StatCan_GDP', years)
    # add manual naics 2012 assignments
    # Activity and Sector are the same
    df['Sector'] = df['Activity'].copy()
    # modify the sector for activity = '31-33': split the manufacturing
    # range across its component 2-digit NAICS
    df.loc[df['Activity'] == '31-33', 'Sector'] = '31'
    df = df.append(pd.DataFrame(
        [['StatCan_GDP', '31-33', '32']],
        columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True)
    df = df.append(pd.DataFrame(
        [['StatCan_GDP', '31-33', '33']],
        columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True)
    # drop 'Other' and nan
    df = df[~df['Activity'].isin(['Other', 'nan'])]
    # Add additional columns
    df['SectorSourceName'] = "NAICS_2012_Code"
    df['SectorType'] = "I"
    # reorder
df.loc[df['Activity'] == 'SWITCHGRASS', 'Sector'] = '111998L' df.loc[df['Activity'] == 'FIELD CROPS, OTHER', 'Sector'] = '111998M' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2012', '2015', '2017', '2018', '2020'] # datasource datasources = ['USDA_ACUP_Pesticide', 'USDA_ACUP_Fertilizer'] # loop through datasources: for d in datasources: # df of unique activity names df_list = [] for y in years: dfy = unique_activity_names(d, y) df_list.append(dfy) df = pd.concat(df_list, ignore_index=True).drop_duplicates() # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' # (because level of detail not needed or to prevent double counting) df = df.dropna() # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(f"{datapath}activitytosectormapping/NAICS_Crosswalk_{d}.csv", index=False)
# !/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Create a crosswalk linking Statistics Canada to NAICS
"""
import pandas as pd
from flowsa.common import datapath, fbaoutputpath
from scripts.common_scripts import unique_activity_names, order_crosswalk

if __name__ == '__main__':
    # select years to pull unique activity names
    years = ['2011', '2015']
    # df of unique ers activity names
    df = unique_activity_names('StatCan_IWS_MI', years)
    # add manual naics 2012 assignments
    # Activity and Sector are the same
    df['Sector'] = df['Activity'].copy()
    # modify the sector for activity = '31-33': split the manufacturing
    # range across its component 2-digit NAICS
    df.loc[df['Activity'] == '31-33', 'Sector'] = '31'
    df = df.append(pd.DataFrame(
        [['StatCan_IWS_MI', '31-33', '32']],
        columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True)
    df = df.append(pd.DataFrame(
        [['StatCan_IWS_MI', '31-33', '33']],
        columns=['ActivitySourceName', 'Activity', 'Sector']), sort=True)
    # drop 'Other' and nan
    df = df[~df['Activity'].isin(['Other', 'nan'])]
sort=True) # all other animal production: 11299, manually add row df = df.append(pd.DataFrame( [['USGS_WU_Coef', 'Sheep and lambs', 'NAICS_2012_Code', '11299']], columns=[ 'ActivitySourceName', 'Activity', 'SectorSourceName', 'Sector' ]), ignore_index=True, sort=True) return df if __name__ == '__main__': # select unique activity names from file years = ['2005'] df = unique_activity_names('USGS_WU_Coef', years) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_USGS_WU_Coef_toNAICS.csv", index=False)
df.loc[df['Activity'] == 'VEGETABLE TOTALS', 'Sector'] = '1112' df.loc[df['Activity'] == 'WHEAT', 'Sector'] = '11114' # df.loc[df['Activity'] == 'LETTUCE', 'Sector'] = '' # df.loc[df['Activity'] == 'POTATOES', 'Sector'] = '' # df.loc[df['Activity'] == 'SWEET CORN', 'Sector'] = '' # df.loc[df['Activity'] == 'TOMATOES', 'Sector'] = '' return df if __name__ == '__main__': # select years to pull unique activity names years = ['2013', '2018'] # df of unique ers activity names df = unique_activity_names('USDA_IWMS', years) # add manual naics 2012 assignments df = assign_naics(df) # drop any rows where naics12 is 'nan' (because level of detail not needed or to prevent double counting) df.dropna(subset=["Sector"], inplace=True) # assign sector type df['SectorType'] = None # sort df df = order_crosswalk(df) # save as csv df.to_csv(datapath + "activitytosectormapping/" + "Crosswalk_USDA_IWMS_toNAICS.csv", index=False)