Example #1
0
def direct_allocation_method(fbs, k, names, method):
    """
    Directly assign activities to sectors
    :param fbs: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns
    """
    log.info('Directly assigning activities to sectors')
    # when activities are not sector-like, verify activity by activity
    # that no data are lost at the target sector level
    if check_activities_sector_like(k) is False:
        apb, acb = fba_activity_fields[0], fba_activity_fields[1]
        allocated_dfs = []
        handled = []
        for activity in names:
            # drop activities already handled so rows carrying values in
            # both the APB and ACB columns are not counted twice
            keep_rows = ~(fbs[apb].isin(handled) | fbs[acb].isin(handled))
            fbs = fbs[keep_rows].reset_index(drop=True)
            log.debug('Checking for %s at %s', activity,
                      method['target_sector_level'])
            subset = fbs[(fbs[apb] == activity) |
                         (fbs[acb] == activity)].reset_index(drop=True)
            # if an Activity maps to more than one sector, allocate equally
            subset = equal_allocation(subset)
            subset = equally_allocate_parent_to_child_naics(
                subset, method['target_sector_level'])
            allocated_dfs.append(subset)
            handled.append(activity)
        fbs = pd.concat(allocated_dfs, ignore_index=True)
    return fbs
Example #2
0
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after
    subsetting by activity and geography with the final flowbysector output
    total. Not a direct comparison of the loaded FBA because FBAs are
    modified before being subset by activity for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout data differences between loaded FBA and FBS output
        totals by location, save results as csv in local directory
    """

    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale: use the smaller (more aggregated) of the two
    # geoscales so both dfs can be compared at a common level
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info(
            'Comparing FBA %s %s subset to FBS results. '
            'Details in Validation Log', activity_set,
            source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # NaN percent difference means the flow is absent from the FBS
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for '
                    '%s %s %s at %s is equal to the '
                    'FlowByActivity FlowAmount', source_name, activity_set, i,
                    j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
    except Exception as e:
        # catch Exception explicitly (a bare except would also swallow
        # KeyboardInterrupt/SystemExit) and log the cause so validation
        # failures are diagnosable without aborting FBS creation
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector: %s', e)
Example #3
0
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout data differences between loaded FBA and FBS output,
             save results as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
        return

    # keys shared by the FBA and FBS aggregations, also used for the merge
    merge_cols = [
        'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
        'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
    ]

    # aggregate the mapped FBA to national totals by activity
    fba = fba_load[[
        'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
        'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
        'LocationSystem', 'Year', 'FlowAmount'
    ]].drop_duplicates().reset_index(drop=True)
    fba.loc[:, 'Location'] = US_FIPS
    fba_agg = aggregator(fba, merge_cols).rename(
        columns={'FlowAmount': 'FBA_amount'})

    # aggregate the FBS to national totals by activity and sector length
    fbs = fbs_load[[
        'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
        'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
        'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
        'Year', 'FlowAmount'
    ]].drop_duplicates().reset_index(drop=True)
    fbs = replace_NoneType_with_empty_cells(fbs)
    # sector length is the longer of the produced/consumed sector codes
    fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
    fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
    fbs['SectorLength'] = fbs[['ProducedLength',
                               'ConsumedLength']].max(axis=1)
    fbs.loc[:, 'Location'] = US_FIPS
    fbs_agg = aggregator(fbs, merge_cols + ['SectorLength']).rename(
        columns={'FlowAmount': 'FBS_amount'})

    # merge the two aggregations and compute the FBS-to-FBA ratio
    df_merge = fba_agg.merge(fbs_agg, on=merge_cols, how='left')
    df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

    # reorder columns
    df_merge = df_merge[merge_cols + [
        'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
    ]]

    # keep only rows at the target sector length
    target_length = sector_level_key[config['target_sector_level']]
    comparison = df_merge[
        df_merge['SectorLength'] == target_length].reset_index(drop=True)

    # flag ratios that deviate from 1 by more than the tolerance
    tolerance = 0.01
    comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                             (comparison['Ratio'] > 1 + tolerance)]

    if len(comparison2) > 0:
        vLog.info(
            'There are %s combinations of flowable/context/sector '
            'length where the flowbyactivity to flowbysector ratio '
            'is less than or greater than 1 by %s', len(comparison2),
            str(tolerance))

    # include in the validation log only rows where the rounded
    # ratio falls below 1 (possible data loss)
    df_v = comparison2[comparison2['Ratio'].apply(
        lambda x: round(x, 3) < 1)].reset_index(drop=True)

    # save to validation log
    log.info(
        'Save the comparison of FlowByActivity load '
        'to FlowBySector ratios for %s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if df_v.empty:
        vLogDetailed.info('Ratios for %s all round to 1', activity_set)
    else:
        vLogDetailed.info(
            'Comparison of FlowByActivity load to '
            'FlowBySector ratios for %s: '
            '\n {}'.format(df_v.to_string()), activity_set)
Example #4
0
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments; expects "method", the name of
        the method yaml corresponding to the flowbysector, and optionally
        "download_FBAs_if_missing" to download missing FBA datasets
    :return: parquet, FBS save to local folder
    """
    # fall back to command-line arguments when called with no kwargs
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    # None when the key is absent; passed through to data loaders
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    # k = source name, v = source configuration from the method yaml
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                # skip activity sets with no data or all-zero flows
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving '
                        'difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v, aset,
                                                    aset_names,
                                                    download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale: the more aggregated of the source
                # geoscale and the allocation-from scale
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                # three cases: both sector columns valid, only produced-by
                # valid (consumed null), only consumed-by valid (produced null)
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            # data is already in FBS format; optionally clean, then append
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(v["clean_fbs_df_fxn_source"],
                                               v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow dt is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of '
        'model results in %s', logoutputpath)
Example #5
0
def equally_allocate_suppressed_parent_to_child_naics(df_load, sector_column,
                                                      groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    # disaggregate sectors and drop rows without a sector value
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # assumes a single SourceName per df — TODO confirm with callers
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df,
    # add back in at the end as copies of the sector columns
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [
            e for e in groupcols if e not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

    # load naics 2 to naics 6 crosswalk, melted so each NAICS6 is paired
    # with every shorter-length code it rolls up to
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(id_vars=["NAICS_6"],
                           var_name="NAICS_Length",
                           value_name="NAICS_Match").drop(
                               columns=['NAICS_Length']).drop_duplicates()

    # rows with FlowAmount == 0 are treated as suppressed
    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup,
                         cw_melt,
                         how='left',
                         left_on=[sector_column],
                         right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols],
                   df,
                   how='left',
                   on=groupcols,
                   indicator=True).query('_merge=="left_only"').drop('_merge',
                                                                     axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)
    # add length column and subset the data
    # subtract out existing data at NAICS6 from total data
    # at a length where no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # for each flow/compartment/location, find the longest sector length
    # with no missing (suppressed) values — one less than the shortest
    # suppressed length
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment',
         'Location'])['secLength'].agg(lambda x: x.min() - 1).reset_index(
             name='secLengthsup'))

    # merge the dfs and sub out the last sector lengths with
    # all data for each state; drop states that don't have suppressed data
    df1 = df.merge(df_sup2)

    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on: truncate NAICS6 to the last
    # fully-populated sector length
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [
        e for e in fba_default_grouping_fields
        if e not in ['ActivityConsumedBy', 'ActivityProducedBy']
    ]
    sum_cols.append('mergeSec')
    # total already-allocated flow under each truncated parent sector
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={
        sector_column: 'NewNAICS',
        'mergeSec': sector_column
    })
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    # split the remaining (unallocated) parent flow equally among the
    # suppressed child NAICS6
    dfe = dfe.assign(newFlow=(dfe['FlowAmount'] - dfe['FlowAlloc']) /
                     dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount', 'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])

    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are source-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)
    return dff
Example #6
0
def sector_disaggregation(df_load):
    """
    Function to disaggregate sectors if there is only one
    naics at a lower level; works for lower than naics 4.

    For each NAICS length i (from the shortest present up to 5), rows whose
    sector has an unambiguous single child in the NAICS i -> i+1 crosswalk
    are copied with the child sector substituted, so flows appear at every
    NAICS level down to 6 digits.

    :param df_load: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like, if aggregating
    # a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # assumes a single SourceName per df — TODO confirm for mixed-source dfs
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df,
    # add back in as copies of sector columns columns to keep
    if sector_like_activities:
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')

    # for loop min length to 6 digits, where min length cannot be less than 2
    # only consider the sector columns that actually contain values
    fields_list = []
    for i in range(2):
        if not (df[fbs_activity_fields[i]] == "").all():
            fields_list.append(fbs_activity_fields[i])

    # shortest sector string length present in the populated sector columns
    length = df[fields_list].apply(lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value
        # in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i)
            | df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(SectorProduced_tmp=df_subset[
            fbs_activity_fields[0]].apply(lambda x: x[0:i]))
        df_subset = df_subset.assign(SectorConsumed_tmp=df_subset[
            fbs_activity_fields[1]].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns
        # are in naics list; the three subsets cover produced-only,
        # consumed-only, and both-populated rows
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list))
            & (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[(df_subset['SectorProduced_tmp'] == "") & (
            df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list))
            & (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3],
                              sort=False)
        # drop all rows with duplicate temp values, as a less aggregated
        # naics exists list of column headers, that if exist in df, should
        # be aggregated using the weighted avg fxn
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [
            e for e in possible_column_headers
            if e in df_subset.columns.values.tolist()
        ]

        # keep=False discards every duplicated row, so a sector is only
        # disaggregated when its parent appears exactly once per grouping
        df_subset = df_subset.drop_duplicates(
            subset=cols_to_drop, keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        # SPB/SCB hold the single child naics for the produced/consumed sectors
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics,
                             cw[[sector_merge, sector_add]],
                             how='left',
                             left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=[
            "SectorProducedBy", "SectorConsumedBy", "SectorProduced_tmp",
            "SectorConsumed_tmp"
        ])
        new_naics = new_naics.rename(columns={
            "SPB": "SectorProducedBy",
            "SCB": "SectorConsumedBy"
        })
        # append new naics to df; left merges leave NaN where no single
        # child existed, restore the empty-string sentinel before concat
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True, ignore_index=True)
    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    # if activities are source-like, set col values
    # as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    return df
Example #7
0
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector.

    Iterates from the longest sector length present down to 3 digits;
    at each length, rows with no pre-existing parent-level entry are
    truncated by one digit and aggregated, then appended to the df.

    :param df_load: Either a flowbyactivity df with sectors or
       a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # assumes a single SourceName per df — TODO confirm for mixed sources
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df and group calls,
    # add back in as copies of sector columns columns to keep
    if sector_like_activities:
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        # subset df
        df_cols = [
            e for e in df.columns
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]
                 ]].apply(lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length, 2, -1):
        # df where either sector column is length or both columns are
        df1 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))
                 | ((df['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                    (df['SectorConsumedBy'].apply(lambda x: len(x) == i)))]

        # add new columns dropping last digit of sectors
        df1 = df1.assign(
            SPB=df1['SectorProducedBy'].apply(lambda x: x[0:i - 1]))
        df1 = df1.assign(
            SCB=df1['SectorConsumedBy'].apply(lambda x: x[0:i - 1]))

        # second dataframe where length is l - 1
        df2 = df[((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) |
                  (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1)))
                 | ((df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) &
                    (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))
                    )].rename(columns={
                        'SectorProducedBy': 'SPB',
                        'SectorConsumedBy': 'SCB'
                    })

        # merge the dfs on the string-typed columns only
        merge_cols = [col for col in df2.columns if hasattr(df2[col], 'str')]
        # also drop activity and description cols
        merge_cols = [
            c for c in merge_cols if c not in
            ['ActivityConsumedBy', 'ActivityProducedBy', 'Description']
        ]

        if len(df2) > 0:
            # anti-join: keep only rows of df1 whose truncated sectors do
            # not already exist at length i - 1
            dfm = df1.merge(df2[merge_cols],
                            how='outer',
                            on=merge_cols,
                            indicator=True).query('_merge=="left_only"').drop(
                                '_merge', axis=1)
        else:
            dfm = df1.copy(deep=True)

        if len(dfm) > 0:
            # replace the SCB and SPB columns then aggregate and add to df
            dfm['SectorProducedBy'] = dfm['SPB']
            dfm['SectorConsumedBy'] = dfm['SCB']
            dfm = dfm.drop(columns=(['SPB', 'SCB']))
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(dfm, group_cols)
            # append to df; DataFrame.append was removed in pandas 2.0,
            # use pd.concat for consistency with the rest of the module
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)
    df = df.drop_duplicates()

    # if activities are source-like, set col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)

    # replace null values
    df = replace_strings_with_NoneType(df).reset_index(drop=True)

    return df
Example #8
0
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can include 'flowSubsetMapped' (df), 'allocMethod' (str),
        and 'activity_set_names' (df) to modify how the subset is determined
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the
    # typical method of subset an example of a special case is when the
    # allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    if kwargs != {}:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                # NOTE(review): this path presumes 'flowSubsetMapped' was
                # also passed; otherwise 'fsm' is undefined below — confirm
                # callers always supply both together
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    if check_activities_sector_like(source) is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values
        # pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(sector_list)
                     ) |
                    (fba_allocation[fbs_activity_fields[1]].isin(sector_list)
                     )].reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))].reset_index(
                    drop=True)
            # build a single 'Sector' column from both sector columns;
            # use .rename() rather than assigning to .columns on a slice,
            # which triggers SettingWithCopyWarning (and fails under
            # pandas copy-on-write)
            part1 = fsm_sub[['SectorConsumedBy']].rename(
                columns={'SectorConsumedBy': 'Sector'})
            part2 = fsm_sub[['SectorProducedBy']].rename(
                columns={'SectorProducedBy': 'Sector'})
            modified_activitynames = \
                pd.concat([part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].
                 isin(modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].
                 isin(modified_activitynames))].reset_index(drop=True)

        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(activitynames)
                 )].reset_index(drop=True)

    # if activity set names included in function call and activity set names
    # is not null, then subset data based on value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = \
            asn[asn['name'].isin(activitynames)].reset_index(drop=True)
        if asn_subset['allocation_subset'].isna().all():
            # no further subsetting requested for these activities
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error('Define column and value to subset on in the activity '
                      'set csv for all rows')
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset