Example #1
def modify_thermo_and_aqua_sector_assignments(df):
    """
    Currently, do not have a method to allocate water withdrawal beyond four digits for Thermoelectric and Aquaculture.
    Therefore, after mapping these 2 activities to sectors, drop associated sectors > 4 digits.
    :param df:
    :return:
    """

    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    activities_to_modify = ("Aquaculture", "Thermoelectric Power")
    max_sector_length = 4

    # if the activity is one of the activities to modify and the sector length
    # is greater than the max specified, drop the row
    # temporarily set None to '' so the len(x) function works
    df = replace_NoneType_with_empty_cells(df)
    # set conditions
    c1 = df[fba_activity_fields[0]].isin(activities_to_modify)
    c2 = df[fba_activity_fields[1]].isin(activities_to_modify)
    c3 = df[fbs_activity_fields[0]].apply(lambda x: len(x) > max_sector_length)
    c4 = df[fbs_activity_fields[1]].apply(lambda x: len(x) > max_sector_length)
    # subset data
    df_modified = df.loc[~((c1 | c2) & (c3 | c4))].reset_index(drop=True)
    # set '' back to None
    df_modified = replace_strings_with_NoneType(df_modified)

    return df_modified
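
A minimal, self-contained sketch of the same masking logic on an invented frame. The literal column names below stand in for what fba_activity_fields/fbs_activity_fields resolve to in flowsa (the Activity/Sector ProducedBy and ConsumedBy columns):

import pandas as pd

# toy frame: one Aquaculture row mapped to a 6-digit sector, one to a 4-digit sector
df = pd.DataFrame({
    'ActivityProducedBy': ['Aquaculture', 'Aquaculture', 'Irrigation'],
    'ActivityConsumedBy': ['', '', ''],
    'SectorProducedBy': ['112511', '1125', '111140'],
    'SectorConsumedBy': ['', '', ''],
})

activities_to_modify = ("Aquaculture", "Thermoelectric Power")
max_sector_length = 4

c1 = df['ActivityProducedBy'].isin(activities_to_modify)
c2 = df['ActivityConsumedBy'].isin(activities_to_modify)
c3 = df['SectorProducedBy'].apply(lambda x: len(x) > max_sector_length)
c4 = df['SectorConsumedBy'].apply(lambda x: len(x) > max_sector_length)

# keep a row unless it is a modified activity AND carries a sector > 4 digits
print(df.loc[~((c1 | c2) & (c3 | c4))])  # the 6-digit Aquaculture row is dropped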
Example #2
def replace_naics_w_naics_2012(df, sectorsourcename):
    """
    Check if activity-like sectors are in fact sectors. Also works for the Sector column
    :return:
    """
    # test
    # df = mapping.copy()
    # drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # load the master crosswalk and subset by sectorsourcename, saving values to a list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    possible_column_headers = [
        'Sector', 'SectorProducedBy', 'SectorConsumedBy'
    ]
    # list of column headers that exist in the df
    column_headers = [
        e for e in possible_column_headers if e in df.columns.values.tolist()
    ]

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics2012 = check_if_sectors_are_naics(df, cw, column_headers)

    # if non-NAICS 2012 codes are found, loop through the df headers and replace values
    if len(non_naics2012) != 0:
        log.info(
            'Checking if sectors represent a different NAICS year; if so, '
            'replacing with NAICS 2012')
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the 'NAICS_2012_Code' column, use that value to replace sector in column c
            df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code']
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df['NAICS_2012_Code'],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
        log.info('Replaced NAICS with NAICS 2012 Codes')

        # check if there are any sectors that are not in the naics 2012 crosswalk
        log.info('Check again for non NAICS 2012 Codes')
        check_if_sectors_are_naics(df, cw, column_headers)

    else:
        log.info('No sectors require substitution')

    return df
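
The replace-and-rescale core of the loop can be run in isolation. The melted crosswalk below is a hypothetical stand-in: one pre-2012 code splitting into two NAICS 2012 codes, with invented allocation ratios:

import pandas as pd

# hypothetical melted crosswalk: one old code mapping to two NAICS 2012 codes
cw_melt = pd.DataFrame({
    'NAICS': ['541712', '541712'],
    'NAICS_2012_Code': ['541713', '541714'],
    'allocation_ratio': [0.6, 0.4],
})

df = pd.DataFrame({'Sector': ['541712'], 'FlowAmount': [100.0]})

c = 'Sector'
# the one-to-many merge duplicates the row, once per 2012 code
df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code']
# scale the flow by each code's share
df.loc[df[c] == df['NAICS_2012_Code'], 'FlowAmount'] = \
    df['FlowAmount'] * df['allocation_ratio']
df = df.drop(columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
print(df)  # 541713/60.0 and 541714/40.0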
Example #3
def write_naics_2012_crosswalk():
    """
    Create a NAICS 2 - 6 digit crosswalk
    :return:
    """

    # load the useeior mastercrosswalk
    cw_load = import_useeior_mastercrosswalk()

    # extract naics 2012 code column and drop duplicates and empty cells
    cw = cw_load[['NAICS_2012_Code']].drop_duplicates()
    cw = replace_NoneType_with_empty_cells(cw)
    cw = cw[cw['NAICS_2012_Code'] != '']

    # dictionary to replace housing and gov't transport sectors after subsetting by naics length
    dict_replacement = {
        'F0': 'F010',
        'F01': 'F010',
        'F0100': 'F01000',
        'S0': 'S00201',
        'S00': 'S00201',
        'S002': 'S00201',
        'S0020': 'S00201'
    }

    # define sectors that might need to be appended
    house_4 = ['F010']
    house_6 = ['F01000']
    govt = ['S00201']

    # create a list of crosswalk dfs at each length
    cw_list = []
    # extract naics by length
    for i in range(2, 7):
        cw_col = 'NAICS_' + str(i)
        cw_col_m1 = 'NAICS_' + str(i - 1)
        cw_i = cw[cw['NAICS_2012_Code'].apply(lambda x: len(x) == i)].\
            reset_index(drop=True).rename(columns={'NAICS_2012_Code': cw_col})
        # address exceptions to the naics length rule - housing and gov't transport sectors
        cw_i[cw_col] = cw_i[cw_col].replace(dict_replacement)
        # add housing/gov't transport sectors, depending on length
        if i in range(2, 4):
            cw_i = pd.concat([cw_i, pd.DataFrame(house_4, columns=[cw_col])],
                             ignore_index=True)
        if i == 5:
            cw_i = pd.concat([cw_i, pd.DataFrame(house_6, columns=[cw_col])],
                             ignore_index=True)
        if i in range(2, 6):
            cw_i = pd.concat([cw_i, pd.DataFrame(govt, columns=[cw_col])],
                             ignore_index=True)
        # add a column holding the parent codes (naics length - 1)
        if i in range(3, 7):
            cw_i[cw_col_m1] = cw_i[cw_col].apply(lambda x: x[0:i - 1])
            # address exceptions to the naics length rule - housing and gov't transport sectors
            cw_i[cw_col_m1] = cw_i[cw_col_m1].replace(dict_replacement)
        cw_list.append(cw_i)

    # loop through the remaining dfs and merge into one wide crosswalk
    naics_cw = cw_list[0]
    for cw_i in cw_list[1:]:
        naics_cw = naics_cw.merge(cw_i, how='outer')

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_2012_Crosswalk.csv", index=False)

    return None
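
A toy illustration of how the truncation and outer merges assemble the wide crosswalk (real NAICS codes, invented frames):

import pandas as pd

# toy pieces of the crosswalk: a 2-digit parent and its 3-digit children
cw_2 = pd.DataFrame({'NAICS_2': ['11']})
cw_3 = pd.DataFrame({'NAICS_3': ['111', '112']})
# the parent of each 3-digit code is its first 2 digits
cw_3['NAICS_2'] = cw_3['NAICS_3'].apply(lambda x: x[0:2])

# an outer merge on the shared parent column builds the wide crosswalk
naics_cw = cw_2.merge(cw_3, how='outer')
print(naics_cw)  # '11' paired with '111' and with '112'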
Example #4
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS is not target_level
    :param df:
    :param target_sector_level:
    :return:
    """

    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and target sector length
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # if data were found at this sector length, log and add to rows_lost
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS to be allocated'
                        ': {}'.format(' '.join(map(str, sector_list))))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated = df_allocated.reset_index(drop=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
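
The equal-allocation arithmetic in isolation, with a hypothetical two-child crosswalk:

import pandas as pd

# hypothetical crosswalk: one 5-digit parent with two 6-digit children
cw = pd.DataFrame({'NAICS_5': ['11251', '11251'],
                   'NAICS_6': ['112511', '112512']})
cw['sector_count'] = cw.groupby('NAICS_5')['NAICS_5'].transform('count')

df = pd.DataFrame({'SectorProducedBy': ['11251'], 'FlowAmount': [90.0]})
df_x = df.merge(cw, how='left', left_on='SectorProducedBy', right_on='NAICS_5')
df_x['SectorProducedBy'] = df_x['NAICS_6']
# split the parent flow evenly across its children: 90 / 2 = 45 each
df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
print(df_x[['SectorProducedBy', 'FlowAmount']])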
Example #5
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Function to compare the loaded flowbyactivity total with the final flowbysector output total
    :param df:
    :return:
    """

    from flowsa.flowbyfunctions import subset_df_by_geoscale, sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info(
        'Comparing loaded FlowByActivity FlowAmount total to subset FlowBySector FlowAmount total'
    )

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] = (df_merge['FlowAmount_difference'] /
                                          df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts
        context_list = df_merge['Context'].to_list()

        # loop through the contexts and log the results of the comparison
        for i in context_list:
            df_merge_subset = df_merge[df_merge['Context'] == i].reset_index(
                drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% less than the total FlowByActivity FlowAmount')
            else:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% more than the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector total FlowAmounts for '
            + activity_set + ' in output folder')
        # output the comparison data
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv",
                        index=False)

    except Exception:
        log.info(
            'Error occurred when comparing total FlowAmounts for FlowByActivity and FlowBySector'
        )

    return None
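
The comparison metrics reduce to a subtraction and a percentage; a stand-alone sketch with invented totals:

import pandas as pd

df_merge = pd.DataFrame({'FBA_amount': [1000.0], 'FBS_amount': [950.0]})
df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge['FBS_amount']
# a positive percent difference means the FBS total is smaller than the FBA total
df_merge['Percent_difference'] = (df_merge['FlowAmount_difference'] /
                                  df_merge['FBA_amount']) * 100
print(df_merge)  # difference of 50.0, or 5.0 percent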
Example #6
def check_for_differences_between_fba_load_and_fbs_output(
        fba_load, fbs_load, activity_set, source_name, method_name):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector output, checking for data loss
    :param df:
    :return:
    """

    from flowsa.flowbyfunctions import replace_strings_with_NoneType, replace_NoneType_with_empty_cells

    # subset fba df
    fba = fba_load[[
        'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
        'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
        'LocationSystem', 'Year', 'FlowAmount'
    ]].drop_duplicates().reset_index(drop=True)
    fba.loc[:, 'Location'] = US_FIPS
    group_cols = [
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
    ]
    fba_agg = aggregator(fba, group_cols)
    fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

    # subset fbs df
    fbs = fbs_load[[
        'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
        'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
        'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem', 'Year',
        'FlowAmount'
    ]].drop_duplicates().reset_index(drop=True)

    fbs = replace_NoneType_with_empty_cells(fbs)

    fbs['ProducedLength'] = fbs['SectorProducedBy'].apply(lambda x: len(x))
    fbs['ConsumedLength'] = fbs['SectorConsumedBy'].apply(lambda x: len(x))
    fbs['SectorLength'] = fbs[['ProducedLength', 'ConsumedLength']].max(axis=1)
    fbs.loc[:, 'Location'] = US_FIPS
    group_cols = [
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
        'SectorLength'
    ]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

    # merge the aggregated fba and fbs totals
    merge_cols = [
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
    ]
    df_merge = fba_agg.merge(fbs_agg, on=merge_cols, how='left')
    df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

    # reorder
    df_merge = df_merge[[
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
        'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
    ]]

    # only report difference at sector length <= 6
    comparison = df_merge[df_merge['SectorLength'] <= 6]

    # todo: address the duplicated rows/data that occur for non-naics household sector length

    ua_count1 = len(comparison[comparison['Ratio'] < 0.95])
    log.info(
        'There are ' + str(ua_count1) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is < 0.95'
    )
    ua_count2 = len(comparison[comparison['Ratio'] < 0.99])
    log.info(
        'There are ' + str(ua_count2) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is < 0.99'
    )
    oa_count1 = len(comparison[comparison['Ratio'] > 1])
    log.info(
        'There are ' + str(oa_count1) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is > 1.0'
    )
    oa_count2 = len(comparison[comparison['Ratio'] > 1.01])
    log.info(
        'There are ' + str(oa_count2) +
        ' combinations of flowable/context/sector length where the flowbyactivity to flowbysector ratio is > 1.01'
    )

    # save csv to output folder
    log.info(
        'Save the comparison of FlowByActivity load to FlowBySector ratios for '
        + activity_set + ' in output folder')
    # output data at all sector lengths
    df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" + method_name +
                    '_' + source_name + "_FBA_load_to_FBS_comparison_" +
                    activity_set + ".csv",
                    index=False)

    return None
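
The threshold counts are boolean sums over the Ratio column; a sketch with invented ratios:

import pandas as pd

comparison = pd.DataFrame({'Ratio': [0.90, 0.97, 1.00, 1.05]})
for label, mask in [('< 0.95', comparison['Ratio'] < 0.95),
                    ('< 0.99', comparison['Ratio'] < 0.99),
                    ('> 1.0', comparison['Ratio'] > 1),
                    ('> 1.01', comparison['Ratio'] > 1.01)]:
    # len(comparison[mask]) in the function above equals mask.sum()
    print(label, mask.sum())  # 1, 2, 1, 1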
Example #7
def check_if_losing_sector_data(df, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at target sector level
    In some instances, not all
    :param fbs:
    :return:
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df_y that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # reset truncated household sector codes so household lengths are not modified
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) & (df_m['spb_tmp'].isnull())]
        # clean df
        rl = clean_df(rl, flow_by_sector_fields, fbs_fill_na_dict)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors

        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # if undisaggregated rows were found, log and add them to rows_lost
        if len(rl) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS not represented in current '
                        'data subset: {}'.format(' '.join(map(str, rl_list))))
            rows_lost = pd.concat([rows_lost, rl_m3],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('Data exists at ' + target_sector_level)
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level +
                 ' associated with the sectors previously dropped')

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
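
The detection step hinges on merging the i-digit rows against the (i + 1)-digit rows truncated back to i digits; a toy version with one matched and one unmatched sector:

import pandas as pd

# parent-level rows (i = 4 digits)
df_x = pd.DataFrame({'SectorProducedBy': ['1125', '1129'],
                     'FlowAmount': [10.0, 20.0]})
# child-level rows (i + 1 = 5 digits), truncated back to 4 digits
df_y = pd.DataFrame({'SectorProducedBy': ['11251']})
df_y['spb_tmp'] = df_y['SectorProducedBy'].apply(lambda x: x[0:4])

df_m = pd.merge(df_x, df_y[['spb_tmp']], how='left',
                left_on='SectorProducedBy', right_on='spb_tmp')
# rows with no truncated match were never disaggregated below 4 digits
print(df_m[df_m['spb_tmp'].isnull()])  # the '1129' row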
Example #8
def disaggregate_cropland(fba_w_sector, attr, method, years_list, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector:
    :param attr:
    :param years_list:
    :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy)
    :param attr:
    :return:
    """

    import flowsa
    from flowsa.flowbyfunctions import sector_aggregation,\
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, fbs_fill_na_dict, add_missing_flow_by_fields,\
        sector_disaggregation, sector_ratios, replace_strings_with_NoneType, replace_NoneType_with_empty_cells,\
        harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(flowclass=['Land'],
                                          years=years_list,
                                          datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    naics = harmonize_units(naics)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics, sectorsourcename=method['target_sector_source'])
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2, group_cols)
    # drop rows where FlowAmount = 0
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary location column to match the two dfs on
    naics4 = naics4.assign(Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # loop through naics lengths to determine which 4 and 5 digit sectors to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(lambda x: i+1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if there are duplicate Location/Sector_tmp combinations, drop all of those rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'], keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics4.loc[naics4[sector_column].apply(lambda x: len(x) == i+1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two dfs on sector and location
        df_subset = pd.merge(
            df_subset,
            naics_subset[[sector_column, 'FlowAmountRatio',
                          'Sector_tmp', 'Location_tmp']],
            how='left', left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns
        df_subset = df_subset.drop(columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
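
The ratio application is a merge on the truncated sector plus a multiplication; a self-contained sketch with invented state-level ratios (note pandas' _x/_y suffixes, which the function renames away):

import pandas as pd

# county-level 4-digit cropland row; Location_tmp holds the state FIPS
df_subset = pd.DataFrame({'Sector': ['1111'], 'Location_tmp': ['17'],
                          'FlowAmount': [100.0]})
# hypothetical state-level 5-digit ratios within parent '1111'
naics_subset = pd.DataFrame({'Sector': ['11111', '11112'],
                             'Sector_tmp': ['1111', '1111'],
                             'Location_tmp': ['17', '17'],
                             'FlowAmountRatio': [0.7, 0.3]})

df_subset = pd.merge(df_subset, naics_subset, how='left',
                     left_on=['Sector', 'Location_tmp'],
                     right_on=['Sector_tmp', 'Location_tmp'])
df_subset['FlowAmount'] = df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
df_subset = df_subset.drop(columns=['Sector_x', 'FlowAmountRatio', 'Sector_tmp'])
df_subset = df_subset.rename(columns={'Sector_y': 'Sector'})
print(df_subset)  # 11111/70.0 and 11112/30.0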
Example #9
def disaggregate_pastureland(fba_w_sector, attr, method, years_list, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: The CoA Cropland dataframe after linked to sectors
    :param attr:
    :param years_list:
    :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy)
    :return: The CoA cropland dataframe with disaggregated pastureland data
    """

    import flowsa
    from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \
        fba_fill_na_dict, replace_strings_with_NoneType, replace_NoneType_with_empty_cells, \
        fba_mapped_default_grouping_fields, harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data to only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))
        df_sourcename = pd.unique(p['SourceName'])[0]

        # load usda coa cropland naics
        df_class = ['Land']
        df_years = years_list
        df_allocation = 'USDA_CoA_Cropland_NAICS'
        df_f = flowsa.getFlowByActivity(flowclass=df_class, years=df_years, datasource=df_allocation)
        df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict)
        df_f = harmonize_units(df_f)
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) == '112']
        # drop rows with "&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(df_f, sectorsourcename=method['target_sector_source'])
        # create proportional ratios
        group_cols = fba_mapped_default_grouping_fields
        group_cols = [e for e in group_cols if
                      e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df_f = allocate_by_sector(df_f, df_sourcename, df_allocation, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']], how='left',
                     left_on="Location_tmp", right_on="Location")
        # multiply the FlowAmount by the FlowAmountRatio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=['Location_tmp', sector_column + '_x', 'Location_y', 'FlowAmountRatio'])
        df = df.rename(columns={sector_column + '_y': sector_column,
                                "Location_x": 'Location'})

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df], sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
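
The state-to-county join keys off the first two FIPS digits of Location; a toy version with an invented allocation ratio:

import pandas as pd

# county-level pastureland rows; state FIPS is the first two digits of Location
p = pd.DataFrame({'Location': ['17031', '17043'], 'FlowAmount': [50.0, 10.0]})
p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

# hypothetical state-level allocation ratio from the 'land in farms' data
df_f = pd.DataFrame({'Sector': ['112111'], 'Location': ['17'],
                     'FlowAmountRatio': [0.25]})

df = p.merge(df_f, how='left', left_on='Location_tmp', right_on='Location')
df['FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
df = df.drop(columns=['Location_tmp', 'Location_y', 'FlowAmountRatio'])
df = df.rename(columns={'Location_x': 'Location'})
print(df)  # county flows scaled by the state ratio: 12.5 and 2.5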