def modify_thermo_and_aqua_sector_assignments(df):
    """
    Drop sector assignments longer than four digits for the Aquaculture and
    Thermoelectric Power activities. There is currently no method to allocate
    water withdrawal beyond four-digit sectors for these two activities, so
    after sector mapping any associated sectors > 4 digits are removed.
    :param df: df with activity and sector columns
    :return: df with long thermo/aqua sector rows dropped
    """
    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    target_activities = ("Aquaculture", "Thermoelectric Power")
    sector_digit_cap = 4

    # temporarily swap NoneType for '' so len() works on the sector columns
    df = replace_NoneType_with_empty_cells(df)

    # rows whose produced- or consumed-activity is one of the target activities
    activity_match = (df[fba_activity_fields[0]].isin(target_activities) |
                      df[fba_activity_fields[1]].isin(target_activities))
    # rows whose produced- or consumed-sector exceeds the digit cap
    sector_too_long = (
        df[fbs_activity_fields[0]].apply(lambda s: len(s) > sector_digit_cap) |
        df[fbs_activity_fields[1]].apply(lambda s: len(s) > sector_digit_cap))

    # keep everything except target-activity rows carrying overly long sectors
    trimmed = df.loc[~(activity_match & sector_too_long)].reset_index(drop=True)

    # restore '' back to NoneType before returning
    trimmed = replace_strings_with_NoneType(trimmed)

    return trimmed
def replace_naics_w_naics_2012(df, sectorsourcename): """ Check if activity-like sectors are in fact sectors. Also works for the Sector column :return: """ # test # df = mapping.copy() # drop NoneType df = replace_NoneType_with_empty_cells(df) # load the mastercroswalk and subset by sectorsourcename, save values to list cw_load = load_sector_crosswalk() cw = cw_load[sectorsourcename].drop_duplicates().tolist() # load melted crosswalk cw_melt = melt_naics_crosswalk() # drop the count column cw_melt = cw_melt.drop(columns='naics_count') # determine which headers are in the df possible_column_headers = [ 'Sector', 'SectorProducedBy', 'SectorConsumedBy' ] # # list of column headers that do exist in the df being aggregated column_headers = [ e for e in possible_column_headers if e in df.columns.values.tolist() ] # check if there are any sectors that are not in the naics 2012 crosswalk non_naics2012 = check_if_sectors_are_naics(df, cw, column_headers) # loop through the df headers and determine if value is not in crosswalk list if len(non_naics2012) != 0: log.info( 'Checking if sectors represent a different NAICS year, if so, replace with NAICS 2012' ) for c in column_headers: # merge df with the melted sector crosswalk df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left') # if there is a value in the 'NAICS_2012_Code' column, use that value to replace sector in column c df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code'] # multiply the FlowAmount col by allocation_ratio df.loc[df[c] == df['NAICS_2012_Code'], 'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio'] # drop columns df = df.drop( columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio']) log.info('Replaced NAICS with NAICS 2012 Codes') # check if there are any sectors that are not in the naics 2012 crosswalk log.info('Check again for non NAICS 2012 Codes') check_if_sectors_are_naics(df, cw, column_headers) else: log.info('No sectors require substitution') return df
def write_naics_2012_crosswalk():
    """
    Create a NAICS 2 - 6 digit crosswalk and write it to
    datapath/NAICS_2012_Crosswalk.csv.

    Builds one df per code length (2-6 digits) from the useeior master
    crosswalk, appends the non-NAICS housing ('F010'/'F01000') and
    government transport ('S00201') sectors at the appropriate lengths, adds
    a parent-code column (length - 1) to each df, and outer-merges them all
    into a single wide crosswalk.
    :return: None
    """
    # load the useeior mastercrosswalk
    cw_load = import_useeior_mastercrosswalk()

    # extract naics 2012 code column and drop duplicates and empty cells
    cw = cw_load[['NAICS_2012_Code']].drop_duplicates()
    cw = replace_NoneType_with_empty_cells(cw)
    cw = cw[cw['NAICS_2012_Code'] != '']

    # replacements for housing and gov't transport codes, which do not follow
    # the standard "parent = child truncated by one digit" naics rule
    dict_replacement = {
        'F0': 'F010',
        'F01': 'F010',
        'F0100': 'F01000',
        'S0': 'S00201',
        'S00': 'S00201',
        'S002': 'S00201',
        'S0020': 'S00201'
    }

    # define sectors that might need to be appended
    house_4 = ['F010']
    house_6 = ['F01000']
    govt = ['S00201']

    # create a list of crosswalk dfs at each length
    cw_list = []
    # extract naics by length
    for i in range(2, 7):
        cw_col = 'NAICS_' + str(i)
        cw_col_m1 = 'NAICS_' + str(i - 1)
        # fix: build each length-specific df in a plain local variable.
        # The original wrote through vars()[...] — mutating the dict returned
        # by locals() inside a function is undefined behavior in CPython.
        cw_i = cw[cw['NAICS_2012_Code'].apply(lambda x: len(x) == i)].\
            reset_index(drop=True).rename(columns={'NAICS_2012_Code': cw_col})
        # address exceptions to naics length rule - housing and gov't
        # sector transport
        cw_i[cw_col] = cw_i[cw_col].replace(dict_replacement)
        # add some housing/gov't transport sectors, depending on length
        extra_rows = []
        if i in range(2, 4):
            extra_rows.append(pd.DataFrame(house_4, columns=[cw_col]))
        if i == 5:
            extra_rows.append(pd.DataFrame(house_6, columns=[cw_col]))
        if i in range(2, 6):
            extra_rows.append(pd.DataFrame(govt, columns=[cw_col]))
        if extra_rows:
            # pd.concat replaces DataFrame.append (removed in pandas 2.0);
            # same row order as the original sequential appends
            cw_i = pd.concat([cw_i] + extra_rows, ignore_index=True)
        # add column with naics of length - 1 (the parent code)
        if i in range(3, 7):
            cw_i[cw_col_m1] = cw_i[cw_col].apply(lambda x: x[0:i - 1])
            # address exceptions to naics length rule - housing and gov't
            # sector transport
            cw_i[cw_col_m1] = cw_i[cw_col_m1].replace(dict_replacement)
        cw_list.append(cw_i)

    # loop through the df list and merge into one wide crosswalk
    naics_cw = cw_list[0]
    for df in cw_list[1:]:
        naics_cw = naics_cw.merge(df, how='outer')

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_2012_Crosswalk.csv", index=False)

    return None
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data.
    Allocates flow amount equally across child NAICS when parent NAICS is
    not at target_sector_level, then subsets the df to the target level.
    :param df: df with a 'SectorProducedBy' column
    :param target_sector_level: str, key of sector_level_key
        (e.g. 'NAICS_6')
    :return: df subset to target_sector_level with previously-shorter NAICS
        allocated equally across their children
    """
    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    # temporarily replace null values with empty cells so len() works
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and target sector length
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with count of children per parent code
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced column with target-level codes
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count, allocating
        # equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS to be allocated'
                        ': {}'.format(' '.join(map(str, sector_list))))
            # pd.concat replaces DataFrame.append (removed in pandas 2.0)
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True,
                                  sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    # fix: drop=True so the old index is not inserted as an 'index' column
    # (consistent with every other reset_index in this module)
    df_allocated = df_allocated.reset_index(drop=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Compare the loaded FlowByActivity FlowAmount total with the final
    FlowBySector output total, log the percent difference per Context, and
    save the comparison as a csv in the output folder.
    :param fba_load: df, loaded FlowByActivity
    :param fbs_load: df, final FlowBySector
    :param activity_set: str, activity set name
    :param source_name: str, FBA source name
    :param method_name: str, FBS method name
    :param attr: dict, activity set attributes (uses 'allocation_from_scale')
    :param method: dict, FBS method yaml (uses 'target_geoscale')
    :param mapping_files: fedelemflowlist mapping file(s) for the source
    :return: None
    """
    from flowsa.flowbyfunctions import subset_df_by_geoscale, sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info(
        'Comparing loaded FlowByActivity FlowAmount total to subset FlowBySector FlowAmount total'
    )

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba to elementary flows
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]

    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)
    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] = (df_merge['FlowAmount_difference'] /
                                          df_merge['FBA_amount']) * 100
        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # loop through the contexts and log results of comparison
        context_list = df_merge['Context'].to_list()
        for i in context_list:
            df_merge_subset = df_merge[df_merge['Context'] == i].reset_index(
                drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)
            # positive difference means the FBS total is below the FBA total
            if diff_per > 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% less than the total FlowByActivity FlowAmount')
            else:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% more than the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector total FlowAmounts for '
            + activity_set + ' in output folder')
        # output data at all sector lengths
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv",
                        index=False)
    # fix: catch Exception (not a bare except, which also swallows
    # KeyboardInterrupt/SystemExit) and surface the actual error
    except Exception as e:
        log.info(
            'Error occured when comparing total FlowAmounts for FlowByActivity and FlowBySector'
        )
        log.info(str(e))

    return None
def check_for_differences_between_fba_load_and_fbs_output(
        fba_load, fbs_load, activity_set, source_name, method_name):
    """
    Compare the loaded FlowByActivity with the final FlowBySector output,
    checking for data loss. Aggregates both dfs to national totals by
    activity/flowable/etc., computes the FBS/FBA ratio per sector length,
    logs counts of under/over-allocated combinations, and saves the
    comparison as a csv in the output folder.
    :param fba_load: df, loaded FlowByActivity
    :param fbs_load: df, final FlowBySector
    :param activity_set: str, activity set name
    :param source_name: str, FBA source name
    :param method_name: str, FBS method name
    :return: None
    """
    from flowsa.flowbyfunctions import replace_strings_with_NoneType, replace_NoneType_with_empty_cells

    # subset fba df to the comparison columns
    fba = fba_load[[
        'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
        'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
        'LocationSystem', 'Year', 'FlowAmount'
    ]].drop_duplicates().reset_index(drop=True)
    # set Location to national so both dfs aggregate to US totals
    fba.loc[:, 'Location'] = US_FIPS
    group_cols = [
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
    ]
    fba_agg = aggregator(fba, group_cols)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

    # subset fbs df to the comparison columns
    fbs = fbs_load[[
        'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
        'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
        'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem', 'Year',
        'FlowAmount'
    ]].drop_duplicates().reset_index(drop=True)

    # replace NoneType with '' so len() works on the sector columns;
    # SectorLength is the longer of the produced/consumed sector codes
    fbs = replace_NoneType_with_empty_cells(fbs)
    fbs['ProducedLength'] = fbs['SectorProducedBy'].apply(lambda x: len(x))
    fbs['ConsumedLength'] = fbs['SectorConsumedBy'].apply(lambda x: len(x))
    fbs['SectorLength'] = fbs[['ProducedLength', 'ConsumedLength']].max(axis=1)
    fbs.loc[:, 'Location'] = US_FIPS
    group_cols = [
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
        'SectorLength'
    ]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

    # merge the fba and fbs aggregations (keys are identical on both sides)
    df_merge = fba_agg.merge(fbs_agg,
                             left_on=[
                                 'ActivityProducedBy', 'ActivityConsumedBy',
                                 'Flowable', 'Unit', 'FlowType', 'Context',
                                 'Location', 'LocationSystem', 'Year'
                             ],
                             right_on=[
                                 'ActivityProducedBy', 'ActivityConsumedBy',
                                 'Flowable', 'Unit', 'FlowType', 'Context',
                                 'Location', 'LocationSystem', 'Year'
                             ],
                             how='left')
    # ratio of 1.0 means no data loss between FBA and FBS
    df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

    # reorder
    df_merge = df_merge[[
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
        'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
    ]]

    # only report difference at sector length <= 6
    comparison = df_merge[df_merge['SectorLength'] <= 6]

    # todo: address the duplicated rows/data that occur for non-naics
    #  household sector length

    # log counts of under-allocated (<0.95, <0.99) combinations
    ua_count1 = len(comparison[comparison['Ratio'] < 0.95])
    log.info(
        'There are ' + str(ua_count1) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is < 0.95'
    )
    ua_count2 = len(comparison[comparison['Ratio'] < 0.99])
    log.info(
        'There are ' + str(ua_count2) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is < 0.99'
    )
    # log counts of over-allocated (>1.0, >1.01) combinations
    oa_count1 = len(comparison[comparison['Ratio'] > 1])
    log.info(
        'There are ' + str(oa_count1) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is > 1.0'
    )
    oa_count2 = len(comparison[comparison['Ratio'] > 1.01])
    log.info(
        'There are ' + str(oa_count2) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is > 1.01'
    )

    # save csv to output folder
    log.info(
        'Save the comparison of FlowByActivity load to FlowBySector ratios for '
        + activity_set + ' in output folder')
    # output data at all sector lengths
    df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" + method_name +
                    '_' + source_name + "_FBA_load_to_FBS_comparison_" +
                    activity_set + ".csv",
                    index=False)

    return None
def check_if_losing_sector_data(df, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at target sector
    level. Rows whose sectors are not disaggregated to more specific NAICS
    are allocated equally across the target-level sectors, then appended
    back to the df.
    :param df: df with SectorProducedBy/SectorConsumedBy columns
    :param target_sector_level: str, key of sector_level_key
        (e.g. 'NAICS_6')
    :return: df with previously-lost rows reallocated to target sector level
    """
    # exclude nonsectors: replace NoneType with '' so len() works
    df = replace_NoneType_with_empty_cells(df)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length: rows where one sector column is i digits and
        # the other is empty, or both are i digits
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # replace truncated household codes ('F0', 'F01') with 'F010' so the
        # merge below still matches; household sectors don't follow the
        # naics truncation rule
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs: an i-length row matches when an (i+1)-length row
        # shares its truncated sector codes plus all other key columns
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        # (no i+1-length match found)
        rl = df_m[(df_m['scb_tmp'].isnull()) & (df_m['spb_tmp'].isnull())]
        # clean df
        rl = clean_df(rl, flow_by_sector_fields, fbs_fill_na_dict)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors

        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with count of target-level children per i-length code
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column: the two merges each produced a
        # sector_count (suffixed _x/_y); take _x, falling back to _y
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count, allocating
        # equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS not represented in current '
                        'data subset: {}'.format(' '.join(map(str, rl_list))))
            rows_lost = rows_lost.append(rl_m3, ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('Data exists at ' + target_sector_level)
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level +
                 ' associated with the sectors previously dropped')

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
def disaggregate_cropland(fba_w_sector, attr, method, years_list, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county
    level, use state level harvested cropland to create ratios and
    disaggregate to longer naics.
    :param fba_w_sector: df, CoA cropland data with sector columns
    :param attr: dict, activity set attributes
    :param method: dict, FBS method yaml (uses 'target_sector_source')
    :param years_list: list of years to pull allocation data for
    :param sector_column: The sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df with cropland disaggregated using state-level ratios
    """
    import flowsa
    from flowsa.flowbyfunctions import sector_aggregation,\
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, fbs_fill_na_dict, add_missing_flow_by_fields,\
        sector_disaggregation, sector_ratios, replace_strings_with_NoneType, replace_NoneType_with_empty_cells,\
        harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes so len()/slicing work on the sector column
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data (sectors starting '112'); it is handled
    # separately by disaggregate_pastureland and re-appended at the end
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(
        lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location (2-digit state fips) for matching state-level data
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(
        flowclass=['Land'],
        years=years_list,
        datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    naics = harmonize_units(naics)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&' (combined-activity rows)
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)
    # drop cols and rename
    # naics = naics.drop(columns=["SectorProducedBy"])
    # naics = naics.rename(columns={"SectorConsumedBy": sector_column})

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in
    #     ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2, group_cols)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') &
    #     (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios (FlowAmountRatio of each child naics within its parent)
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # for loop through naics lengths to determine naics 4 and 5 digits to
    # disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows: a (Location, Sector_tmp) pair that
        # appears more than once is already partially disaggregated
        df = crop_subset.drop_duplicates(
            subset=['Location', 'Sector_tmp'],
            keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics4.loc[naics4[sector_column].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(
            df_subset,
            naics_subset[[
                sector_column, 'FlowAmountRatio', 'Sector_tmp', 'Location_tmp'
            ]],
            how='left',
            left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[
            ~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns (merge suffixed the shared sector column with _x/_y)
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns: keep the longer (i+1 digit) sector from the right df
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data (re-append the rows dropped at the top)
    pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def disaggregate_pastureland(fba_w_sector, attr, method, years_list, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the 3
    digit NAICS '112'. This function uses state level CoA 'Land in Farms' to
    allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: The CoA Cropland dataframe after linked to sectors
    :param attr: dict, activity set attributes
    :param method: dict, FBS method yaml (uses 'target_sector_source')
    :param years_list: list of years to pull allocation data for
    :param sector_column: The sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: The CoA cropland dataframe with disaggregated pastureland data
    """
    import flowsa
    from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \
        fba_fill_na_dict, replace_strings_with_NoneType, replace_NoneType_with_empty_cells, \
        fba_mapped_default_grouping_fields, harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes so slicing works on the sector column
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland (sectors starting '112')
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))
        df_sourcename = pd.unique(p['SourceName'])[0]

        # load usda coa cropland naics
        df_class = ['Land']
        df_years = years_list
        df_allocation = 'USDA_CoA_Cropland_NAICS'
        df_f = flowsa.getFlowByActivity(flowclass=df_class,
                                        years=df_years,
                                        datasource=df_allocation)
        df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict)
        df_f = harmonize_units(df_f)
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(
            lambda x: x[0:3]) == '112']
        # drop rows with "&' (combined-activity rows)
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # create proportional ratios
        group_cols = fba_mapped_default_grouping_fields
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df_f = allocate_by_sector(df_f, df_sourcename, df_allocation,
                                  'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location (keep the 2-digit state fips)
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # merge the coa pastureland data with land in farm data; the shared
        # sector and Location columns get _x/_y suffixes
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename (keep the 6-digit sector from the right df
        # and the original county-level Location from the left df)
        df = df.drop(columns=[
            'Location_tmp', sector_column + '_x', 'Location_y',
            'FlowAmountRatio'
        ])
        df = df.rename(columns={
            sector_column + '_y': sector_column,
            "Location_x": 'Location'
        })

        # drop rows where sector = 112 and then concat with original
        # fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

    # fill empty cells with NoneType; applied unconditionally so the
    # no-pastureland path also has its '' values restored to NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector