Example 1
def proportional_allocation_by_location(df):
    """
    Creates a proportional allocation based on all the most
    aggregated sectors within a location
    Ensure that sectors are at 2 digit level - can run sector_aggregation()
    prior to using this function
    :param df: df, includes sector columns
    :return: df, with 'FlowAmountRatio' column
    """

    # tmp drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # subset to the most aggregated (2-digit) sectors to build the denominator
    denom_df = df.loc[(df['SectorProducedBy'].apply(lambda x: len(x) == 2)) |
                      (df['SectorConsumedBy'].apply(lambda x: len(x) == 2))]
    denom_df = denom_df.assign(Denominator=denom_df['FlowAmount'].groupby(
        denom_df['Location']).transform('sum'))
    denom_df_2 = denom_df[[
        'Location', 'LocationSystem', 'Year', 'Denominator'
    ]].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left')
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = allocation_df[
        'FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index()

    # add nonetypes
    allocation_df = replace_strings_with_NoneType(allocation_df)

    return allocation_df
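
A minimal usage sketch for the function above, assuming it is pasted alongside these simplified stand-ins for the flowsa helpers it calls (an assumption about their behaviour, not their actual code); the toy frame carries only the columns the function touches:

import pandas as pd

# simplified stand-ins for the flowsa helpers referenced above (assumed behaviour)
def replace_NoneType_with_empty_cells(df):
    return df.fillna('')

def replace_strings_with_NoneType(df):
    return df.replace('', pd.NA)

# toy frame: the two 2-digit rows form the Location denominator (100 + 60 = 160)
df = pd.DataFrame({
    'SectorProducedBy': ['11', '111', '21'],
    'SectorConsumedBy': ['', '', ''],
    'FlowAmount': [100.0, 40.0, 60.0],
    'Location': ['06000'] * 3,
    'LocationSystem': ['FIPS_2015'] * 3,
    'Year': [2017] * 3,
})

out = proportional_allocation_by_location(df)
print(out[['SectorProducedBy', 'FlowAmountRatio']])  # 0.625, 0.25, 0.375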
Example 2
def proportional_allocation_by_location_and_activity(df, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated sector within a location
    :param df: df with sector columns
    :param sectorcolumn: str, sector column for which to create allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """

    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df)

    # denominator summed from highest level of sector grouped by location
    short_length = min(df[sectorcolumn].apply(lambda x: len(str(x))).unique())
    # want to create denominator based on short_length
    denom_df = df.loc[df[sectorcolumn].apply(
        lambda x: len(x) == short_length)].reset_index(drop=True)
    grouping_cols = [
        e for e in [
            'FlowName', 'Location', 'Activity', 'ActivityConsumedBy',
            'ActivityProducedBy'
        ] if e in denom_df.columns.values.tolist()
    ]
    denom_df.loc[:, 'Denominator'] = denom_df.groupby(
        grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers that, if they exist in df, are used to subset
    # the denominator values and merge them back to the allocation df
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers
        if e in denom_df.columns.values.tolist()
    ]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = denom_df[column_headers].drop_duplicates().reset_index(
        drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2,
                             how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index(
        drop=True)

    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
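
The ratio calculation above rests on a groupby/transform denominator. A stripped-down, self-contained sketch of just that pattern, with hypothetical values:

import pandas as pd

df = pd.DataFrame({
    'Location': ['06000', '06000', '06000'],
    'ActivityConsumedBy': ['A', 'A', 'B'],
    'HelperFlow': [30.0, 70.0, 50.0],
})
# denominator: HelperFlow summed within each Location/Activity group
df['Denominator'] = df.groupby(
    ['Location', 'ActivityConsumedBy'])['HelperFlow'].transform('sum')
df['FlowAmountRatio'] = df['HelperFlow'] / df['Denominator']
print(df)  # ratios 0.3 and 0.7 for activity A, 1.0 for activity B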
Example 3
def write_naics_2012_crosswalk():
    """
    Create a NAICS 2 - 6 digit crosswalk
    :return: None; writes NAICS_2012_Crosswalk.csv to datapath
    """

    # load the useeior mastercrosswalk subset to the naics timeseries
    cw_load = load_crosswalk('sector_timeseries')

    # load BEA codes that will act as NAICS
    house = load_crosswalk('household')
    govt = load_crosswalk('government')
    bea = pd.concat([house, govt], ignore_index=True).rename(
        columns={'Code': 'NAICS_2012_Code',
                 'NAICS_Level_to_Use_For': 'secLength'})
    bea = bea[['NAICS_2012_Code', 'secLength']]

    # extract naics 2012 code column and drop duplicates and empty cells
    cw = cw_load[['NAICS_2012_Code']].drop_duplicates()
    cw = replace_NoneType_with_empty_cells(cw)
    cw = cw[cw['NAICS_2012_Code'] != '']
    # also drop the existing household and government codes because not all
    # inclusive and does not conform to NAICS length standards
    cw = cw[~cw['NAICS_2012_Code'].str.startswith(
        tuple(['F0', 'S0']))].reset_index(drop=True)

    # add column of sector length
    cw['secLength'] = cw['NAICS_2012_Code'].apply(
        lambda x: f"NAICS_{str(len(x))}")
    # add bea codes subbing for NAICS
    cw2 = pd.concat([cw, bea], ignore_index=True)

    # create dictionary of dataframes
    d = dict(tuple(cw2.groupby('secLength')))

    for l in range(2, 9):
        d[f'NAICS_{l}'] = d[f'NAICS_{l}'][['NAICS_2012_Code']].reset_index(
            drop=True).rename(
            columns={'NAICS_2012_Code': f'NAICS_{l}'})

    naics_cw = d['NAICS_2']
    for l in range(3, 7):
        naics_cw = (d[f'NAICS_{l}'].assign(temp=d[f'NAICS_{l}'][
            f'NAICS_{l}'].str.extract(
            pat=f"({'|'.join(naics_cw[f'NAICS_{l-1}'])})")).merge(
            naics_cw, how='right', left_on='temp',
            right_on=f'NAICS_{l-1}',
            suffixes=['', '_y'])).drop(columns=['temp'])

    # reorder
    naics_cw = naics_cw.reindex(sorted(naics_cw.columns), axis=1)
    # save as csv
    naics_cw.to_csv(datapath + "NAICS_2012_Crosswalk.csv", index=False)
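
The crosswalk build above splits one DataFrame into a dictionary of per-level frames. A small self-contained sketch of that dict(tuple(groupby(...))) idiom:

import pandas as pd

cw2 = pd.DataFrame({
    'NAICS_2012_Code': ['11', '111', '1111', '21'],
    'secLength': ['NAICS_2', 'NAICS_3', 'NAICS_4', 'NAICS_2'],
})
# one DataFrame per sector length, keyed by the secLength value
d = dict(tuple(cw2.groupby('secLength')))
print(list(d.keys()))                            # ['NAICS_2', 'NAICS_3', 'NAICS_4']
print(d['NAICS_2']['NAICS_2012_Code'].tolist())  # ['11', '21']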
Example 4
def proportional_allocation(df, attr):
    """
    Creates a proportional allocation based on all the most
    aggregated sectors within a location
    Ensure that sectors are at 2 digit level - can run sector_aggregation()
    prior to using this function
    :param df: df, includes sector columns
    :param attr: dictionary, attributes for an activity set
    :return: df, with 'FlowAmountRatio' column
    """

    # tmp drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # determine if any additional columns beyond location and sector by which
    # to base allocation ratios
    if 'allocation_merge_columns' in attr:
        groupby_cols = ['Location'] + attr['allocation_merge_columns']
        denom_subset_cols = [
            'Location', 'LocationSystem', 'Year', 'Denominator'
        ] + attr['allocation_merge_columns']
    else:
        groupby_cols = ['Location']
        denom_subset_cols = [
            'Location', 'LocationSystem', 'Year', 'Denominator'
        ]

    denom_df = df.loc[(df['SectorProducedBy'].apply(lambda x: len(x) == 2)) |
                      (df['SectorConsumedBy'].apply(lambda x: len(x) == 2))]

    # generate denominator based on identified groupby cols
    denom_df = denom_df.assign(Denominator=denom_df.groupby(groupby_cols)
                               ['FlowAmount'].transform('sum'))
    # subset select columns by which to generate ratios
    denom_df_2 = denom_df[denom_subset_cols].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left')
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index()

    # add nonetypes
    allocation_df = replace_strings_with_NoneType(allocation_df)

    return allocation_df
Example 5
def aggregator(df, groupbycols):
    """
    Aggregates the flowbyactivity or flowbysector 'FlowAmount' column in df
    and generates weighted average values, based on FlowAmount, for numeric
    columns
    :param df: df, Either flowbyactivity or flowbysector
    :param groupbycols: list, Either flowbyactivity or flowbysector columns
    :return: df, with aggregated columns
    """

    # reset index
    df = df.reset_index(drop=True)
    # tmp replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    # drop rows where flowamount = 0
    df = df[df['FlowAmount'] != 0]

    # list of column headers, that if exist in df, should be
    # aggregated using the weighted avg fxn
    possible_column_headers = \
        ('Spread', 'Min', 'Max', 'DataReliability', 'TemporalCorrelation',
         'GeographicalCorrelation', 'TechnologicalCorrelation',
         'DataCollection')

    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers if e in df.columns.values.tolist()
    ]

    df_dfg = df.groupby(groupbycols).agg({'FlowAmount': ['sum']})

    # run through other columns creating weighted average
    for e in column_headers:
        df_dfg[e] = get_weighted_average(df, e, 'FlowAmount', groupbycols)

    df_dfg = df_dfg.reset_index()
    df_dfg.columns = df_dfg.columns.droplevel(level=1)

    # if datatypes are strings, ensure that Null values remain NoneType
    df_dfg = replace_strings_with_NoneType(df_dfg)

    return df_dfg
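
get_weighted_average is a flowsa helper not shown on this page; per the comment above it produces FlowAmount-weighted means of the listed data-quality columns. A plain-pandas sketch of that idea (an illustration of the concept, not the helper's actual code):

import pandas as pd

df = pd.DataFrame({
    'Location': ['06000', '06000', '48000'],
    'FlowAmount': [10.0, 30.0, 5.0],
    'DataReliability': [1.0, 3.0, 2.0],
})
# weighted average = sum(score * weight) / sum(weight) within each group
df['weighted'] = df['DataReliability'] * df['FlowAmount']
wavg = df.groupby('Location')[['weighted', 'FlowAmount']].sum()
wavg['DataReliability'] = wavg['weighted'] / wavg['FlowAmount']
print(wavg['DataReliability'])  # 06000 -> 2.5, 48000 -> 2.0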
Example 6
def disaggregate_pastureland(fba_w_sector, attr, method, year, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links
    to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
                          modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated pastureland data
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) ==
                         '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year, flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) ==
                        '112']
        # drop rows with '&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=[
            'Location_tmp', sector_column +
            '_x', 'Location_y', 'FlowAmountRatio'
        ])
        df = df.rename(columns={
            sector_column + '_y': sector_column,
            "Location_x": 'Location'
        })

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
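
The county-to-state allocation above hinges on a temporary two-digit FIPS key. A self-contained sketch of that merge-and-multiply step with hypothetical acreage values:

import pandas as pd

# county-level pastureland acreage (Location = 5-digit FIPS)
county = pd.DataFrame({
    'Location': ['06037', '06071'],
    'SectorConsumedBy': ['112', '112'],
    'FlowAmount': [100.0, 50.0],
})
county['Location_tmp'] = county['Location'].str[:2]

# state-level ratios that split '112' into 6-digit NAICS (made-up values)
state = pd.DataFrame({
    'Location': ['06', '06'],
    'SectorConsumedBy': ['112111', '112112'],
    'FlowAmountRatio': [0.6, 0.4],
})

df = county.merge(state, how='left',
                  left_on='Location_tmp', right_on='Location',
                  suffixes=('', '_state'))
# each county row is split across the state-level 6-digit sectors
df['FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
print(df[['Location', 'SectorConsumedBy_state', 'FlowAmount']])  # 60/40 and 30/20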
Example 7
def allocate_dropped_sector_data(df_load, target_sector_level):
    """
    Determine rows of data that would be lost if the data were subset at the
    target sector level; equally allocate parent NAICS to child NAICS where
    child NAICS are missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths or gov't transport
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull())
                  & (df_m['spb_tmp'].isnull())].reset_index(drop=True)
        # clean df
        rl = replace_strings_with_NoneType(rl)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors

        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            vLogDetailed.warning(
                'Data found at %s digit NAICS not represented in current '
                'data subset: {}'.format(' '.join(map(str, rl_list))), str(i))
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to each %s associated with '
            'the sectors previously dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
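
The equal allocation above amounts to dividing a parent FlowAmount by the number of target-level children found in the crosswalk. A stripped-down sketch of that step with a hypothetical two-child parent:

import pandas as pd

# row that would otherwise be lost: data only exists at the 5-digit parent
rl = pd.DataFrame({'SectorProducedBy': ['11111'], 'FlowAmount': [90.0]})

# crosswalk from the parent level to the target (6-digit) level
cw = pd.DataFrame({'NAICS_5': ['11111', '11111'],
                   'NAICS_6': ['111110', '111119']})
cw['sector_count'] = cw.groupby('NAICS_5')['NAICS_5'].transform('count')

rl_m = rl.merge(cw, how='left', left_on='SectorProducedBy', right_on='NAICS_5')
rl_m['SectorProducedBy'] = rl_m['NAICS_6']
rl_m['FlowAmount'] = rl_m['FlowAmount'] / rl_m['sector_count']
print(rl_m[['SectorProducedBy', 'FlowAmount']])  # 45.0 allocated to each child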
Example 8
def estimate_suppressed_data(df, sector_column, naics_level, sourcename):
    """
    Estimate data suppression, by equally allocating parent NAICS values to child NAICS
    :param df: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param naics_level: numeric, indicate at what NAICS length to base
                        estimated suppressed data off (2 - 5)
    :param sourcename: str, sourcename
    :return: df, with estimated suppressed data
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df)

    # find the longest length sector
    max_length = max(df[sector_column].apply(lambda x: len(str(x))).unique())
    # loop through starting at naics_level, use most detailed level possible to save time
    for i in range(naics_level, max_length):
        # create df of i length
        df_x = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # create df of i + 1 length
        df_y = df.loc[df[sector_column].apply(lambda x: len(x) == i + 1)]
        # create temp sector columns in df y, that are i digits in length
        df_y = df_y.assign(s_tmp=df_y[sector_column].apply(lambda x: x[0:i]))

        # create list of location and temp activity combos that contain a 0
        missing_sectors_df = df_y[df_y['FlowAmount'] == 0]
        missing_sectors_list = missing_sectors_df[['Location',
                                                   's_tmp']].drop_duplicates().values.tolist()
        # subset the y df
        if len(missing_sectors_list) != 0:
            # new df of sectors that start with the missing sectors;
            # drop the last digit of the sector, sum flows, and set conditions
            suppressed_list = []
            for q, r in missing_sectors_list:
                c1 = df_y['Location'] == q
                c2 = df_y['s_tmp'] == r
                # subset data
                suppressed_list.append(df_y.loc[c1 & c2])
            suppressed_sectors = pd.concat(suppressed_list, sort=False, ignore_index=True)
            # add column of existing allocated data for length of i
            suppressed_sectors['alloc_flow'] =\
                suppressed_sectors.groupby(['Location', 's_tmp'])['FlowAmount'].transform('sum')
            # subset further so only keep rows of 0 value
            suppressed_sectors_sub = suppressed_sectors[suppressed_sectors['FlowAmount'] == 0]
            # add count
            suppressed_sectors_sub = \
                suppressed_sectors_sub.assign(sector_count=
                                              suppressed_sectors_sub.groupby(
                                                  ['Location', 's_tmp']
                                              )['s_tmp'].transform('count'))

            # merge suppressed sector subset with df x
            df_m = pd.merge(df_x,
                            suppressed_sectors_sub[['Class', 'Compartment', 'FlowType',
                                                    'FlowName', 'Location', 'LocationSystem',
                                                    'Unit', 'Year', sector_column, 's_tmp',
                                                    'alloc_flow', 'sector_count']],
                            left_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                                     'Location', 'LocationSystem', 'Unit', 'Year', sector_column],
                            right_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                                      'Location', 'LocationSystem', 'Unit', 'Year', 's_tmp'],
                            how='right')
            # drop any rows where flowamount is none
            df_m = df_m[~df_m['FlowAmount'].isna()]
            # calculate estimated flows by subtracting the flow
            # amount already allocated from total flow of
            # sector one level up and divide by number of sectors with suppressed data
            df_m.loc[:, 'FlowAmount'] = \
                (df_m['FlowAmount'] - df_m['alloc_flow']) / df_m['sector_count']
            # only keep the suppressed sector subset activity columns
            df_m = df_m.drop(columns=[sector_column + '_x', 's_tmp', 'alloc_flow', 'sector_count'])
            df_m = df_m.rename(columns={sector_column + '_y': sector_column})
            # reset activity columns
            if load_source_catalog()[sourcename]['sector-like_activities']:
                df_m = df_m.assign(ActivityProducedBy=df_m['SectorProducedBy'])
                df_m = df_m.assign(ActivityConsumedBy=df_m['SectorConsumedBy'])

            # drop the existing rows with suppressed data and append the new estimates from fba df
            modified_df =\
                pd.merge(df, df_m[['FlowName', 'Location', sector_column]],
                         indicator=True,
                         how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
            df = pd.concat([modified_df, df_m], ignore_index=True)
    df_w_estimated_data = replace_strings_with_NoneType(df)

    return df_w_estimated_data
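
The replacement step near the end uses an indicator merge as an anti-join, so rows being re-estimated are dropped from the original df before the new estimates are concatenated back. A minimal sketch of that anti-join pattern:

import pandas as pd

df = pd.DataFrame({'FlowName': ['A', 'A', 'B'],
                   'Location': ['06', '48', '06'],
                   'FlowAmount': [1.0, 2.0, 3.0]})
replacements = pd.DataFrame({'FlowName': ['A'], 'Location': ['06']})

# keep only the df rows that have no match in `replacements`
kept = (pd.merge(df, replacements, how='outer', indicator=True)
        .query('_merge == "left_only"')
        .drop(columns='_merge'))
print(kept)  # the ('A', '06') row is dropped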
Example 9
def sector_disaggregation(df):
    """
    Function to disaggregate sectors when only one NAICS exists at the next,
    more detailed level; fills in the missing child codes up through NAICS 6
    :param df: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df)

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()

    # for loop min length to 6 digits, where min length cannot be less than 2
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i+1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                           df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=
                                     df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=
                                     df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset_1 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[(df_subset['SectorProduced_tmp'].isin(sector_list)) &
                                    (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3], sort=False)
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        # list of column headers that, if they exist in df, are used to
        # identify duplicate rows
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [e for e in possible_column_headers if e
                        in df_subset.columns.values.tolist()]

        df_subset = df_subset.drop_duplicates(subset=cols_to_drop,
                                              keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'], right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["SectorProducedBy", "SectorConsumedBy",
                                            "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(columns={"SPB": "SectorProducedBy",
                                              "SCB": "SectorConsumedBy"})
        # append new naics to df
        new_naics['SectorConsumedBy'] = new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True)
    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    return df
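
The crosswalk subset above keeps only parents that map to a single child, which is exactly what drop_duplicates(..., keep=False) does. A tiny sketch of that filter:

import pandas as pd

cw = pd.DataFrame({'NAICS_4': ['1111', '1111', '2111'],
                   'NAICS_5': ['11111', '11112', '21112']})
# keep=False drops every row whose NAICS_4 appears more than once,
# leaving only parents with exactly one child to disaggregate into
cw_single = cw.drop_duplicates(subset=['NAICS_4'], keep=False)
print(cw_single)  # only the 2111 -> 21112 row remains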
Example 10
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not, sums the less aggregated sectors
    :param df_load: Either a flowbyactivity df with sectors or a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """

    # determine if activities are sector-like, if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # load source catalog
        cat = load_source_catalog()
        # for s in pd.unique(flowbyactivity_df['SourceName']):
        s = pd.unique(df_load['SourceName'])[0]
        # load catalog info for source
        src_info = cat[s]
        sector_like_activities = src_info['sector-like_activities']

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # if activities are sector-like, drop them from the df and group cols;
    # they are added back in later as copies of the sector columns
    if sector_like_activities:
        group_cols = [e for e in group_cols if e not in
                      ('ActivityProducedBy', 'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns if e not in
                   ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length - 1, 1, -1):
        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                           df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create a list of i digit sectors in df subset
        sector_subset = df_subset[
            ['Location', fbs_activity_fields[0],
             fbs_activity_fields[1]]].drop_duplicates().reset_index(drop=True)
        df_sectors = sector_subset.copy()
        df_sectors.loc[:, 'SectorProducedBy'] = \
            df_sectors['SectorProducedBy'].apply(lambda x: x[0:i])
        df_sectors.loc[:, 'SectorConsumedBy'] = \
            df_sectors['SectorConsumedBy'].apply(lambda x: x[0:i])
        sector_list = df_sectors.drop_duplicates().values.tolist()
        # create a list of sectors that are exactly i digits long
        # where either sector column is i digits in length
        df_existing_1 = \
            sector_subset.loc[(sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                              (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # where both sector columns are i digits in length
        df_existing_2 = \
            sector_subset.loc[(sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                              (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # concat existing dfs
        df_existing = pd.concat([df_existing_1, df_existing_2], sort=False)
        existing_sectors = df_existing.drop_duplicates().dropna().values.tolist()
        # list of sectors of length i that are not in sector list
        missing_sectors = [e for e in sector_list if e not in existing_sectors]
        if len(missing_sectors) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows
            # set conditions
            agg_sectors_list = []
            for q, r, s in missing_sectors:
                c1 = df_subset['Location'] == q
                c2 = df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i] == r)
                c3 = df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i] == s)
                # subset data
                agg_sectors_list.append(df_subset.loc[c1 & c2 & c3])
            agg_sectors = pd.concat(agg_sectors_list, sort=False)
            agg_sectors = agg_sectors.loc[
                (agg_sectors[fbs_activity_fields[0]].apply(lambda x: len(x) > i)) |
                (agg_sectors[fbs_activity_fields[1]].apply(lambda x: len(x) > i))]
            agg_sectors.loc[:, fbs_activity_fields[0]] = agg_sectors[fbs_activity_fields[0]].apply(
                lambda x: x[0:i])
            agg_sectors.loc[:, fbs_activity_fields[1]] = agg_sectors[fbs_activity_fields[1]].apply(
                lambda x: x[0:i])
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(agg_sectors, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors], sort=False).reset_index(drop=True)

    # manually modify non-NAICS codes that might exist in sector
    df.loc[:, 'SectorConsumedBy'] = np.where(df['SectorConsumedBy'].isin(['F0', 'F01']),
                                             'F010', df['SectorConsumedBy'])  # domestic/household
    df.loc[:, 'SectorProducedBy'] = np.where(df['SectorProducedBy'].isin(['F0', 'F01']),
                                             'F010', df['SectorProducedBy'])  # domestic/household
    # drop any duplicates created by modifying sector codes
    df = df.drop_duplicates()
    # if activities are source-like, set col values as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df)

    return df
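
The aggregation loop above effectively truncates child sectors to i digits and sums their flows to create a missing parent level. A plain-pandas sketch of that one step, outside the flowsa grouping machinery:

import pandas as pd

df = pd.DataFrame({'Location': ['06', '06'],
                   'SectorProducedBy': ['1111', '1112'],
                   'FlowAmount': [10.0, 15.0]})
# truncate to the 3-digit parent and sum, creating the missing '111' row
parent = (df.assign(SectorProducedBy=df['SectorProducedBy'].str[:3])
            .groupby(['Location', 'SectorProducedBy'], as_index=False)['FlowAmount']
            .sum())
print(parent)  # Location 06, SectorProducedBy 111, FlowAmount 25.0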
Example 11
def equally_allocate_suppressed_parent_to_child_naics(df_load, sector_column,
                                                      groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop them from the df;
    # they are added back in later as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [
            e for e in groupcols if e not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(id_vars=["NAICS_6"],
                           var_name="NAICS_Length",
                           value_name="NAICS_Match").drop(
                               columns=['NAICS_Length']).drop_duplicates()

    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup,
                         cw_melt,
                         how='left',
                         left_on=[sector_column],
                         right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols],
                   df,
                   how='left',
                   on=groupcols,
                   indicator=True).query('_merge=="left_only"').drop('_merge',
                                                                     axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)
    # add length column and subset the data
    # subtract out existing data at NAICS6 from total data
    # at a length where no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # add column for each state of sector length where
    # there are no missing values
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment',
         'Location'])['secLength'].agg(lambda x: x.min() - 1).reset_index(
             name='secLengthsup'))

    # merge the dfs and sub out the last sector lengths with
    # all data for each state; drop states that don't have suppressed data
    df1 = df.merge(df_sup2)

    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [
        e for e in fba_default_grouping_fields
        if e not in ['ActivityConsumedBy', 'ActivityProducedBy']
    ]
    sum_cols.append('mergeSec')
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={
        sector_column: 'NewNAICS',
        'mergeSec': sector_column
    })
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] - dfe['FlowAlloc']) /
                     dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount', 'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])

    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are source-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)
    return dff
Example 12
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after
    subsetting by activity and geography with the final flowbysector output
    total. Not a direct comparison of the loaded FBA because FBAs are
    modified before being subset by activity for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """

    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info(
            'Comparing FBA %s %s subset to FBS results. '
            'Details in Validation Log', activity_set,
            source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for '
                    '%s %s %s at %s is equal to the '
                    'FlowByActivity FlowAmount', source_name, activity_set, i,
                    j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')
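
The comparison itself is a left merge of the two aggregated totals followed by a difference and a percent difference. A short sketch of just that arithmetic with made-up totals:

import pandas as pd

fba_agg = pd.DataFrame({'Location': ['06', '48'], 'FBA_amount': [100.0, 80.0]})
fbs_agg = pd.DataFrame({'Location': ['06', '48'], 'FBS_amount': [90.0, 80.0]})

df_merge = fba_agg.merge(fbs_agg, how='left')
df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge['FBS_amount']
df_merge['Percent_difference'] = (
    df_merge['FlowAmount_difference'] / df_merge['FBA_amount']) * 100
print(df_merge)  # 10% difference for '06', 0% for '48'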
Example 13
def determine_flows_requiring_disaggregation(df_load, attr, method,
                                             sector_column):
    """
    The MECS Land data provides FlowAmounts for NAICS3-6. We use BLS QCEW
    employment data to determine land use for different industries. To
    accurately estimate land use per industry, existing FlowAmounts for a
    particular NAICS level (NAICS6, for example) should be subtracted from
    the possible FlowAmounts for other NAICS6 that share the first 5 digits.
    For example, there is data for '311', '3112', and '311221' in the 2014
    dataset. FlowAmounts for allocation by employment for NAICS6 are based
    on the provided '3112' FlowAmounts. However, since there is data at one
    NAICS6 (311221), the FlowAmount for that NAICS6 should be subtracted
    from other NAICS6 to accurately depict the remaining 'FlowAmount' that
    requires a secondary source (Employment data) for allocation.
    :param df_load: df, EIA MECS Land FBA
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param sector_column: str, sector column to flag ('SectorProducedBy',
        'SectorConsumedBy')
    :return: A dataframe with a column 'disaggregate_flag', if '1',
             row requires secondary source to calculate
             FlowAmount, if '0' FlowAmount does not require modifications
    """

    from flowsa.sectormapping import add_sectors_to_flowbyactivity

    df_load = replace_NoneType_with_empty_cells(df_load)
    # drop rows where there is no value in sector column, which might occur if
    # sector-like activities have a "-" in them
    df_load = df_load[df_load[sector_column] != '']

    # determine activity column
    if sector_column == 'SectorConsumedBy':
        activity_column = 'ActivityConsumedBy'
    else:
        activity_column = 'ActivityProducedBy'

    # original df - subset
    # subset cols of original df
    dfo = df_load[['FlowAmount', 'Location', sector_column]]
    # min and max length
    min_length = min(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    max_length = max(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    # subset by sector length, creating a df
    for s in range(min_length, max_length + 1):
        df_name = 'dfo_naics' + str(s)
        vars()[df_name] = dfo[dfo[sector_column].apply(
            lambda x: len(x) == s)].reset_index(drop=True)
        vars()[df_name] = vars()[df_name].assign(SectorMatch=vars(
        )[df_name][sector_column].apply(lambda x: x[:len(x) - 1]))
    # loop through the dfs, merging by sector match. If there is a match,
    # subtract the value, if there is not a match, drop last digit in
    # sectormatch, add row to the next df, and repeat
    df_merged = pd.DataFrame()
    df_not_merged = pd.DataFrame()
    for s in range(max_length, min_length, -1):
        df_name_1 = 'dfo_naics' + str(s - 1)
        df_name_2 = 'dfo_naics' + str(s)
        # concat df 1 with df_not_merged
        df2 = pd.concat([vars()[df_name_2], df_not_merged])
        df2 = df2.rename(columns={
            'FlowAmount': 'SubtractFlow',
            sector_column: 'Sector'
        })
        df_m = pd.merge(
            vars()[df_name_1][['FlowAmount', 'Location', sector_column]],
            df2,
            left_on=['Location', sector_column],
            right_on=['Location', 'SectorMatch'],
            indicator=True,
            how='outer')
        # subset by merge and append to appropriate df
        df_both = df_m[df_m['_merge'] == 'both']
        if len(df_both) != 0:
            # drop columns
            df_both1 = df_both.drop(
                columns=['Sector', 'SectorMatch', '_merge'])
            # aggregate before subtracting
            df_both2 = df_both1.groupby(
                ['FlowAmount', 'Location', sector_column],
                as_index=False).agg({"SubtractFlow": sum})
            df_both3 = df_both2.assign(FlowAmount=df_both2['FlowAmount'] -
                                       df_both2['SubtractFlow'])
            df_both3 = df_both3.drop(columns=['SubtractFlow'])
            # drop rows where 0
            # df_both = df_both[df_both['FlowAmount'] != 0]
            df_merged = pd.concat([df_merged, df_both3], ignore_index=True)
        df_right = df_m[df_m['_merge'] == 'right_only']
        if len(df_right) != 0:
            df_right = df_right.drop(
                columns=['FlowAmount', sector_column, '_merge'])
            df_right = df_right.rename(columns={
                'SubtractFlow': 'FlowAmount',
                'Sector': sector_column
            })
            # remove another digit from Sectormatch
            df_right = df_right.assign(SectorMatch=df_right[sector_column].
                                       apply(lambda x: x[:(s - 2)]))
            # reorder
            df_right = df_right[[
                'FlowAmount', 'Location', sector_column, 'SectorMatch'
            ]]
            df_not_merged = pd.concat([df_not_merged, df_right], ignore_index=True)
    # rename the flowamount column
    df_merged = df_merged.rename(columns={
        'FlowAmount': 'FlowAmountNew',
        sector_column: activity_column
    })
    # In the original EIA MECS df, some of the NAICS 6-digit codes sum
    # to a value greater than published NAICS3, due to rounding. In these
    # cases, the new FlowAmount is a negative number. Reset neg numbers to 0
    df_merged.loc[df_merged['FlowAmountNew'] < 0, 'FlowAmountNew'] = 0
    # in the original df, drop the sector columns and re-add sectors, this
    # time with sector level = 'aggregated'
    dfn = df_load.drop(columns=[
        'SectorProducedBy', 'ProducedBySectorType', 'SectorConsumedBy',
        'ConsumedBySectorType', 'SectorSourceName'
    ])
    dfn = add_sectors_to_flowbyactivity(
        dfn,
        sectorsourcename=method['target_sector_source'],
        overwrite_sectorlevel='aggregated')
    # add column noting that these columns require an allocation ratio
    dfn = dfn.assign(disaggregate_flag=1)
    # create lists of sectors to drop
    list_original = df_load[activity_column].drop_duplicates().tolist()
    # drop values in original df
    dfn2 = dfn[~dfn[sector_column].isin(list_original)].sort_values(
        [activity_column, sector_column]).reset_index(drop=True)
    # drop the sectors that are duplicated by different naics being
    # mapped to naics6
    if len(dfn2[dfn2.duplicated(subset=['Location', sector_column],
                                keep=False)]) > 0:
        dfn2.drop_duplicates(subset=['Location', sector_column],
                             keep='last',
                             inplace=True)
    # want to allocate at NAICS6, so drop all other sectors
    dfn2 = \
        dfn2[dfn2[sector_column].apply(lambda x: len(x) == 6)].reset_index(
            drop=True).sort_values([sector_column])

    # merge revised flowamounts back with modified original df
    df_to_allocate = dfn2.merge(df_merged, how='left')
    # replace FlowAmount with newly calculated FlowAmount,
    # which represents Flows that are currently unaccounted for at NAICS6
    df_to_allocate['FlowAmount'] = np.where(
        df_to_allocate['FlowAmountNew'].notnull(),
        df_to_allocate['FlowAmountNew'], df_to_allocate['FlowAmount'])
    # drop rows where flow amount = 0 - flows are captured through other NAICS6
    df_to_allocate2 = df_to_allocate[df_to_allocate['FlowAmount'] != 0].drop(
        columns='FlowAmountNew').reset_index(drop=True)

    # merge the original df with modified
    # add column to original df for disaggregate_flag
    df_load = df_load.assign(disaggregate_flag=0)

    # concat the two dfs and sort
    df_c = pd.concat([df_load, df_to_allocate2],
                     ignore_index=True).sort_values([sector_column
                                                     ]).reset_index(drop=True)

    df_c = replace_strings_with_NoneType(df_c).sort_values([sector_column])

    return df_c
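
The subtraction the docstring describes can be seen with two rows: a published child NAICS flow is netted out of its parent before the remainder is spread by employment. A hypothetical numeric sketch of that prefix-match subtraction:

import pandas as pd

parent = pd.DataFrame({'Location': ['06'], 'Sector': ['31122'],
                       'FlowAmount': [100.0]})
child = pd.DataFrame({'Location': ['06'], 'Sector': ['311221'],
                      'FlowAmount': [30.0]})
# match the child to its parent on the first five digits and subtract
child['SectorMatch'] = child['Sector'].str[:5]
m = parent.merge(child, left_on=['Location', 'Sector'],
                 right_on=['Location', 'SectorMatch'],
                 suffixes=('', '_child'))
m['Remaining'] = m['FlowAmount'] - m['FlowAmount_child']
print(m[['Sector', 'Remaining']])  # 70.0 still to allocate across other 31122x codes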
Example 14
def proportional_allocation_by_location_and_activity(df_load, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated
    sector within a location
    :param df_load: df with sector columns
    :param sectorcolumn: str, sector column for which to create
         allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """

    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # want to create denominator based on shortest length naics for each
    # activity/location
    grouping_cols = [
        e for e in [
            'FlowName', 'Location', 'Activity', 'ActivityConsumedBy',
            'ActivityProducedBy', 'Class', 'SourceName', 'Unit', 'FlowType',
            'Compartment', 'Year'
        ] if e in df.columns.values.tolist()
    ]
    activity_cols = [
        e for e in ['Activity', 'ActivityConsumedBy', 'ActivityProducedBy']
        if e in df.columns.values.tolist()
    ]
    # trim whitespace
    df[sectorcolumn] = df[sectorcolumn].str.strip()
    # to create the denominator dataframe first add a column that captures
    # the sector length
    denom_df = df.assign(sLen=df[sectorcolumn].str.len())
    denom_df = denom_df[denom_df['sLen'] == denom_df.groupby(activity_cols)
                        ['sLen'].transform(min)].drop(columns='sLen')
    denom_df.loc[:, 'Denominator'] = \
        denom_df.groupby(grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers that, if they exist in df, are used to subset
    # the denominator values and merge them back to the allocation df
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers
        if e in denom_df.columns.values.tolist()
    ]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = \
        denom_df[column_headers].drop_duplicates().reset_index(drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2,
                             how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(columns=['Denominator']).reset_index(
        drop=True)
    # where parent NAICS are not found in the allocation dataset, make sure
    # those child NAICS are not dropped
    allocation_df['FlowAmountRatio'] = \
        allocation_df['FlowAmountRatio'].fillna(1)
    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
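
As a rough, self-contained sketch of the allocation-ratio logic (toy data; the column names only loosely mirror flowsa's schema), the denominator is built from the shortest sector within each activity/location group and HelperFlow is divided by it:

import pandas as pd

df = pd.DataFrame({
    'Location': ['06000', '06000', '06000'],
    'ActivityConsumedBy': ['Crops', 'Crops', 'Crops'],
    'Sector': ['111', '1111', '1112'],
    'HelperFlow': [100.0, 60.0, 40.0]})

# the denominator comes from the shortest (most aggregated) sector
# within each activity/location group
df['sLen'] = df['Sector'].str.len()
min_len = df.groupby(['Location', 'ActivityConsumedBy'])['sLen'].transform('min')
denom = (df[df['sLen'] == min_len]
         .groupby(['Location', 'ActivityConsumedBy'], as_index=False)
         ['HelperFlow'].sum()
         .rename(columns={'HelperFlow': 'Denominator'}))

# merge the denominator back and compute allocation ratios,
# keeping ratios of 1 where no denominator was found
out = df.merge(denom, on=['Location', 'ActivityConsumedBy'], how='left')
out['FlowAmountRatio'] = (out['HelperFlow'] / out['Denominator']).fillna(1)
out = out.drop(columns=['sLen', 'Denominator'])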
Example no. 15
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland
    at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make
                          df modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3])
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(datasource="USDA_CoA_Cropland_NAICS",
                                          year=year,
                                          flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') & (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # loop through naics lengths to disaggregate naics 4 and 5 digit sectors
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(lambda x:
                                                   len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 sector_column, 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x:
                                                           x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
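
The prefix-merge step inside the loop above can be shown with a small standalone example, splitting a county-level parent sector into children using state-level ratios (toy data and names, assumed purely for illustration):

import pandas as pd

# county-level parent sector to be split
county = pd.DataFrame({'Location': ['06001'],
                       'Sector': ['1111'],
                       'FlowAmount': [100.0]})
county['Location_tmp'] = county['Location'].str[0:2]

# state-level ratios at the child (5-digit) level
state_ratios = pd.DataFrame({'Location_tmp': ['06', '06'],
                             'Sector': ['11111', '11112'],
                             'FlowAmountRatio': [0.7, 0.3]})
state_ratios['Sector_tmp'] = state_ratios['Sector'].str[0:4]

# merge the children onto the parent via the state prefix and the
# 4-digit sector prefix, then scale the parent flow by the ratios
merged = county.merge(state_ratios,
                      how='left',
                      left_on=['Sector', 'Location_tmp'],
                      right_on=['Sector_tmp', 'Location_tmp'])
merged['FlowAmount'] = merged['FlowAmount'] * merged['FlowAmountRatio']
children = (merged.drop(columns=['Sector_x', 'Sector_tmp',
                                 'FlowAmountRatio', 'Location_tmp'])
            .rename(columns={'Sector_y': 'Sector'}))
# children now holds flows of 70 and 30 for 11111 and 11112 respectively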
Example no. 16
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Compare the loaded flowbyactivity with the final flowbysector output
    by activity name (if it exists) at the target sector level,
    checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
             save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df

        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge compare 1 and compare 2
        df_merge = fba_agg.merge(fbs_agg,
                                 left_on=[
                                     'ActivityProducedBy',
                                     'ActivityConsumedBy', 'Flowable', 'Unit',
                                     'FlowType', 'Context', 'Location',
                                     'LocationSystem', 'Year'
                                 ],
                                 right_on=[
                                     'ActivityProducedBy',
                                     'ActivityConsumedBy', 'Flowable', 'Unit',
                                     'FlowType', 'Context', 'Location',
                                     'LocationSystem', 'Year'
                                 ],
                                 how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of the specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info(
                'There are %s combinations of flowable/context/sector '
                'length where the flowbyactivity to flowbysector ratio '
                'is less than or greater than 1 by %s', len(comparison2),
                str(tolerance))

        # include df subset in the validation log
        # only print rows where the flowamount ratio is
        # less than 1 (round flowamountratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info(
            'Save the comparison of FlowByActivity load '
            'to FlowBySector ratios for %s in validation log', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FlowByActivity load to '
                'FlowBySector ratios for %s: \n %s',
                activity_set, df_v.to_string())
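
A compact sketch of the comparison logic, assuming simplified column names: aggregate both frames, merge, compute the ratio, and flag rows outside the tolerance:

import pandas as pd

fba = pd.DataFrame({'Activity': ['A', 'A', 'B'],
                    'FlowAmount': [10.0, 5.0, 20.0]})
fbs = pd.DataFrame({'Activity': ['A', 'B'],
                    'FlowAmount': [15.0, 18.0]})

# aggregate each df to the activity level before comparing
fba_agg = (fba.groupby('Activity', as_index=False)['FlowAmount'].sum()
           .rename(columns={'FlowAmount': 'FBA_amount'}))
fbs_agg = (fbs.groupby('Activity', as_index=False)['FlowAmount'].sum()
           .rename(columns={'FlowAmount': 'FBS_amount'}))

# merge and flag ratios that differ from 1 by more than the tolerance
compare = fba_agg.merge(fbs_agg, on='Activity', how='left')
compare['Ratio'] = compare['FBS_amount'] / compare['FBA_amount']
tolerance = 0.01
flagged = compare[(compare['Ratio'] < 1 - tolerance) |
                  (compare['Ratio'] > 1 + tolerance)]
# flagged contains activity 'B', whose FBS total (18) lost flow
# relative to the FBA total (20)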
Example no. 17
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data
    """

    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length

        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning(
                'Data found at %s digit NAICS to be allocated: %s', str(i),
                ' '.join(map(str, sector_list)))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
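
The equal-allocation step can be sketched with a toy crosswalk (column names are hypothetical): count the children per parent, merge, and divide each parent's flow by that count:

import pandas as pd

# parent-level flows and a toy parent-to-child crosswalk
df = pd.DataFrame({'Sector': ['111', '112'],
                   'FlowAmount': [90.0, 40.0]})
cw = pd.DataFrame({'NAICS_3': ['111', '111', '111', '112', '112'],
                   'NAICS_4': ['1111', '1112', '1113', '1121', '1122']})

# count how many children each parent maps to
cw['sector_count'] = cw.groupby('NAICS_3')['NAICS_3'].transform('count')

# merge, move to the child codes, and split each parent's flow equally
df_x = df.merge(cw, how='left', left_on='Sector', right_on='NAICS_3')
df_x['Sector'] = df_x['NAICS_4']
df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
df_x = df_x.drop(columns=['NAICS_3', 'NAICS_4', 'sector_count'])
# '111' becomes three rows of 30 each; '112' becomes two rows of 20 each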
Example no. 18
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non-sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the master crosswalk and subset by sectorsourcename,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug(
            'Checking if sectors represent a different '
            'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where((df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                             df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [
            e for e in df.columns.values.tolist()
            if e not in possible_column_headers
        ]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy are NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull())
                     & (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns contain %s',
                       ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(drop=True)

    df = replace_strings_with_NoneType(df)

    return df
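
A minimal sketch of the crosswalk replacement, assuming a toy melted crosswalk with an allocation_ratio column (names simplified from the function above):

import numpy as np
import pandas as pd

# '233' stands in for an outdated code; '111' is assumed valid
df = pd.DataFrame({'Sector': ['233', '111'],
                   'FlowAmount': [100.0, 50.0]})
non_naics = ['233']
# toy melted crosswalk: the old code is split across two 2012 codes
cw_melt = pd.DataFrame({'NAICS': ['233', '233'],
                        'NAICS_2012_Code': ['236', '237'],
                        'allocation_ratio': [0.6, 0.4]})

merged = df.merge(cw_melt, how='left', left_on='Sector', right_on='NAICS')
# swap in the 2012 code only where the original sector is not a valid NAICS
merged['Sector'] = np.where(merged['Sector'].isin(non_naics) &
                            merged['NAICS_2012_Code'].notnull(),
                            merged['NAICS_2012_Code'], merged['Sector'])
# scale the flow by the crosswalk ratio where a replacement occurred
merged.loc[merged['Sector'] == merged['NAICS_2012_Code'], 'FlowAmount'] = \
    merged['FlowAmount'] * merged['allocation_ratio']
merged = merged.drop(columns=['NAICS', 'NAICS_2012_Code', 'allocation_ratio'])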
Example no. 19
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or
       a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are sector-like, drop them from the df and group_cols,
    # then add back in later as copies of the sector columns
    if sector_like_activities:
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]
                 ]].apply(lambda x: x.str.len()).max().max()
    length = int(length)
    # loop in reverse order over sector lengths, from the longest down to 3,
    # appending the missing, more aggregated naics levels to the df
    for i in range(length, 2, -1):
        # df where either sector column is length or both columns are
        df1 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))
                 | ((df['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                    (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))]

        # add new columns dropping last digit of sectors
        df1 = df1.assign(
            SPB=df1['SectorProducedBy'].apply(lambda x: x[0:i - 1]))
        df1 = df1.assign(
            SCB=df1['SectorConsumedBy'].apply(lambda x: x[0:i - 1]))

        # second dataframe where length is i - 1
        df2 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1)))
                 | ((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) &
                    (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))
                    )].rename(columns={
                        'SectorProducedBy': 'SPB',
                        'SectorConsumedBy': 'SCB'
                    })

        # merge the dfs
        merge_cols = [col for col in df2.columns if hasattr(df2[col], 'str')]
        # also drop activity and description cols
        merge_cols = [
            c for c in merge_cols if c not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

        if len(df2) > 0:
            dfm = df1.merge(df2[merge_cols],
                            how='outer',
                            on=merge_cols,
                            indicator=True).query('_merge=="left_only"').drop(
                                '_merge', axis=1)
        else:
            dfm = df1.copy(deep=True)

        if len(dfm) > 0:
            # replace the SCB and SPB columns then aggregate and add to df
            dfm['SectorProducedBy'] = dfm['SPB']
            dfm['SectorConsumedBy'] = dfm['SCB']
            dfm = dfm.drop(columns=(['SPB', 'SCB']))
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(dfm, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)
    df = df.drop_duplicates()

    # if activities are sector-like, set the activity col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    # replace null values
    df = replace_strings_with_NoneType(df).reset_index(drop=True)

    return df
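
A standalone sketch of the aggregation idea, using a single toy sector column: children are trimmed to the parent code, parents already present are excluded, and the remainder are summed and appended:

import pandas as pd

df = pd.DataFrame({'Sector': ['1111', '1112', '21'],
                   'FlowAmount': [10.0, 5.0, 7.0]})

# derive the parent code by trimming the last digit of the 4-digit sectors
children = df[df['Sector'].str.len() == 4].assign(
    Parent=lambda d: d['Sector'].str[:-1])
# keep only parents that do not already exist in the df
existing_parents = df.loc[df['Sector'].str.len() == 3, 'Sector']
missing = children[~children['Parent'].isin(existing_parents)]
# sum the children to create the missing parent rows and append them
parents = (missing.groupby('Parent', as_index=False)['FlowAmount'].sum()
           .rename(columns={'Parent': 'Sector'}))
df = pd.concat([df, parents], ignore_index=True)
# df now also contains an aggregated '111' row with FlowAmount 15.0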