Example #1
0
def proportional_allocation_by_location_and_sector(df, sectorcolumn):
    """
    Create a proportional allocation within each aggregated sector within a location.
    :param df: pd.DataFrame, flow data with sector columns; must include
        'SourceName', 'Location', 'LocationSystem', 'Year', 'FlowAmount'
        and the column named by sectorcolumn
    :param sectorcolumn: str, name of the sector column used for allocation
    :return: pd.DataFrame, input rows with an added 'FlowAmountRatio' column
        (FlowAmount / denominator summed over the aggregated sector within
        each location); the input df is not modified
    """
    from flowsa.common import load_source_catalog

    # load source catalog to determine the level of sector aggregation
    # associated with the crosswalk of the df's (single) source
    cat = load_source_catalog()
    src_info = cat[pd.unique(df['SourceName'])[0]]
    level_of_aggregation = src_info['sector_aggregation_level']

    # denominator summed from the highest (shortest) sector level,
    # grouped by location
    sector_lengths = df[sectorcolumn].astype(str).str.len()
    short_length = sector_lengths.min()
    # subset to rows at the shortest sector length; .copy() so the column
    # assignments below do not hit SettingWithCopy on a view of df
    denom_df = df.loc[sector_lengths == short_length].copy()
    # want to create denominator based on short_length - 1, unless short_length = 2
    if (level_of_aggregation == 'disaggregated') & (short_length != 2):
        short_length = short_length - 1
        denom_df['sec_tmp'] = denom_df[sectorcolumn].apply(lambda x: x[0:short_length])
        denom_df['Denominator'] = denom_df.groupby(
            ['Location', 'sec_tmp'])['FlowAmount'].transform('sum')
    else:
        # at the 2-digit level (or for aggregated crosswalks) each sector
        # is its own denominator
        denom_df['Denominator'] = denom_df['FlowAmount']
        denom_df['sec_tmp'] = denom_df[sectorcolumn]

    denom_df_2 = denom_df[['Location', 'LocationSystem', 'Year', 'sec_tmp',
                           'Denominator']].drop_duplicates()
    # merge the denominator column with the sector df; work on a copy so the
    # temporary 'sec_tmp' column is not left behind on the caller's df
    df = df.copy()
    df['sec_tmp'] = df[sectorcolumn].apply(lambda x: x[0:short_length])
    allocation_df = df.merge(denom_df_2, how='left',
                             on=['Location', 'LocationSystem', 'Year', 'sec_tmp'])
    # ratio of each flow to its (location, aggregated-sector) total
    allocation_df['FlowAmountRatio'] = \
        allocation_df['FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(
        columns=['Denominator', 'sec_tmp']).reset_index(drop=True)

    return allocation_df
Example #2
0
def direct_allocation_method(flow_subset_mapped, k, names, method):
    """
    Directly assign activities to sectors.
    :param flow_subset_mapped: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns
    """
    log.info('Directly assigning activities to sectors')
    fbs = flow_subset_mapped.copy()
    # for each activity, if activities are not sector like, check that there is no data loss
    if load_source_catalog()[k]['sector-like_activities'] is False:
        activity_list = []
        for n in names:
            log.debug('Checking for %s at %s', n, method['target_sector_level'])
            # keep rows where the activity appears as produced-by or
            # consumed-by; the previous conjunction term
            # ((APB == n) & (ACB == n)) was subsumed by this disjunction
            # and has been removed
            fbs_subset = fbs[(fbs[fba_activity_fields[0]] == n) |
                             (fbs[fba_activity_fields[1]] == n)].reset_index(drop=True)
            # reallocate any sectors dropped when mapping to the target level
            fbs_subset = allocate_dropped_sector_data(fbs_subset, method['target_sector_level'])
            activity_list.append(fbs_subset)
        fbs = pd.concat(activity_list, ignore_index=True)
    return fbs
Example #3
0
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name):
    """
    Add Sectors from the Activity fields and map them to Sector from the crosswalk.
    No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :return: a df with activity fields mapped to 'sectors'
    """

    # accumulate one Activity->Sector mapping df per source name found in the df
    mappings = []

    # First check if source activities are NAICS like - if so make it into a mapping file

    cat = load_source_catalog()

    for s in pd.unique(flowbyactivity_df['SourceName']):
        src_info = cat[s]
        # read the pre-determined level of sector aggregation of each crosswalk from the source catalog
        levelofSectoragg = src_info['sector_aggregation_level']
        # if data are provided in NAICS format, use the mastercrosswalk
        if src_info['sector-like_activities']:
            cw = load_sector_crosswalk()
            # NOTE(review): this uses the module-level sector_source_name,
            # not the sectorsourcename parameter — confirm that is intended
            sectors = cw.loc[:, [sector_source_name]]
            # Create mapping df that's just the sectors at first
            mapping = sectors.drop_duplicates()
            # Add the sector twice as activities so mapping is identical
            mapping = mapping.assign(Activity=sectors[sector_source_name])
            mapping = mapping.rename(columns={sector_source_name: "Sector"})
            # add columns so can run expand_naics_list_fxn
            # if sector-like_activities = True, missing columns, so add
            mapping['ActivitySourceName'] = s
            # tmp assignment
            mapping['SectorType'] = None
            # Include all digits of naics in mapping, if levelofNAICSagg is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        else:
            # if source data activities are text strings, call on the manually created source crosswalks
            mapping = get_activitytosector_mapping(s)
            # filter by SectorSourceName of interest
            mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
            # drop SectorSourceName
            mapping = mapping.drop(columns=['SectorSourceName'])
            # Include all digits of naics in mapping, if levelofNAICSagg is specified as "aggregated"
            if levelofSectoragg == 'aggregated':
                mapping = expand_naics_list(mapping, sectorsourcename)
        mappings.append(mapping)
    mappings_df = pd.concat(mappings, sort=False)
    # Merge in with flowbyactivity by both activity directions
    # (ProducedBy and ConsumedBy), renaming the mapping columns to match
    # the direction being merged each pass
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mappings_df.rename(
            columns={
                'Activity': flowbyactivity_field,
                'Sector': flowbysector_field,
                'SectorType': sector_type_field
            })
        # column doesn't exist for sector-like activities, so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp,
                                             how='left',
                                             on=flowbyactivity_field)
    # normalize missing sector values to None (rather than NaN) for
    # downstream consumers
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace(
        {np.nan: None})

    return flowbyactivity_wsector_df
Example #4
0
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity.
    :param fba_allocation: df, the allocation dataframe to subset
    :param source: str, source name used to look up catalog info and crosswalk
    :param activitynames: list, activity names to subset by
    :param kwargs: optional 'flowSubsetMapped' (df) and 'allocMethod' (str);
        'proportional-flagged' triggers the special-case subset below
    :return: df, subset of fba_allocation
    """

    # first determine if there are special cases that would modify the typical method of subset
    # an example of a special case is when the allocation method is 'proportional-flagged'
    if kwargs != {}:
        special_case = False
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                special_case = True
    else:
        special_case = False

    # NOTE(review): if allocMethod='proportional-flagged' is passed without
    # flowSubsetMapped, 'fsm' is never bound and the special-case branch
    # below raises NameError — confirm callers always pass both together

    # load the source catalog
    cat = load_source_catalog()
    src_info = cat[source]
    if src_info['sector-like_activities'] is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        # NOTE(review): .all() on a string Series reduces to a single value;
        # presumably every row shares one SectorSourceName — verify upstream
        sector_source_name = df['SectorSourceName'].all()
        df = expand_naics_list(df, sector_source_name)
        # subset source crosswalk to only contain values pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(sector_list)].reset_index(
                    drop=True)
        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(sector_list)) |
                (fba_allocation[fbs_activity_fields[1]].isin(sector_list)
                 )].reset_index(drop=True)
    else:
        # activities are already sector-like, so subset directly on activitynames
        if 'Sector' in fba_allocation:
            fba_allocation_subset = fba_allocation.loc[
                fba_allocation['Sector'].isin(activitynames)].reset_index(
                    drop=True)
        elif special_case:
            # if it is a special case, then base the subset of data on sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))].reset_index(
                    drop=True)
            # collect sectors from both produced-by and consumed-by columns
            part1 = fsm_sub[['SectorConsumedBy']]
            part2 = fsm_sub[['SectorProducedBy']]
            part1.columns = ['Sector']
            part2.columns = ['Sector']
            # union the two sector columns, dropping duplicates and nulls
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames = modified_activitynames[
                modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].
                 isin(modified_activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].
                 isin(modified_activitynames))].reset_index(drop=True)

        else:
            fba_allocation_subset = fba_allocation.loc[
                (fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                (fba_allocation[fbs_activity_fields[1]].isin(activitynames)
                 )].reset_index(drop=True)

    return fba_allocation_subset
Example #5
0
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=sector_source_name,
                                  **kwargs):
    """
    Add Sectors from the Activity fields and map them to Sector from the crosswalk.
    No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :param kwargs: option to include the parameter 'allocationmethod', which modifies
        function behavior if = 'direct'; also accepts 'overwrite_sectorlevel'
    :return: a df with activity fields mapped to 'sectors'
    """
    # First check if source activities are NAICS like - if so make it into a mapping file

    cat = load_source_catalog()

    # only the first source name in the df is used for catalog lookup
    # for s in pd.unique(flowbyactivity_df['SourceName']):
    s = pd.unique(flowbyactivity_df['SourceName'])[0]
    # load catalog info for source
    src_info = cat[s]
    # if activities are sector-like, check if need to modify mapping
    if 'modify_sector-like_activities' in src_info:
        modify_sector_like_activities = src_info[
            'modify_sector-like_activities']
    else:
        modify_sector_like_activities = False
    # read the pre-determined level of sector aggregation of each crosswalk from the source catalog
    levelofSectoragg = src_info['sector_aggregation_level']
    # if the FBS activity set is 'direct', overwrite the levelofsectoragg, or if specified in fxn call
    if kwargs != {}:
        if 'allocationmethod' in kwargs:
            if kwargs['allocationmethod'] == 'direct':
                levelofSectoragg = 'disaggregated'
        if 'overwrite_sectorlevel' in kwargs:
            levelofSectoragg = kwargs['overwrite_sectorlevel']
    # if data are provided in NAICS format, use the mastercrosswalk
    if src_info[
            'sector-like_activities'] and modify_sector_like_activities is False:
        cw = load_sector_crosswalk()
        # NOTE(review): this uses the module-level sector_source_name,
        # not the sectorsourcename parameter — confirm that is intended
        sectors = cw.loc[:, [sector_source_name]]
        # Create mapping df that's just the sectors at first
        mapping = sectors.drop_duplicates()
        # Add the sector twice as activities so mapping is identical
        mapping = mapping.assign(Activity=sectors[sector_source_name])
        mapping = mapping.rename(columns={sector_source_name: "Sector"})
        # add columns so can run expand_naics_list_fxn
        # if sector-like_activities = True, missing columns, so add
        mapping['ActivitySourceName'] = s
        # tmp assignment
        mapping['SectorType'] = None
        # Include all digits of naics in mapping, if levelofNAICSagg is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    else:
        # if source data activities are text strings, or sector-like activities should be modified, \
        # call on the manually created source crosswalks
        mapping = get_activitytosector_mapping(s)
        # filter by SectorSourceName of interest
        mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
        # drop SectorSourceName
        mapping = mapping.drop(columns=['SectorSourceName'])
        # Include all digits of naics in mapping, if levelofNAICSagg is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    # Merge in with flowbyactivity by both activity directions
    # (ProducedBy and ConsumedBy), renaming the mapping columns to match
    # the direction being merged each pass
    flowbyactivity_wsector_df = flowbyactivity_df
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mapping.rename(
            columns={
                'Activity': flowbyactivity_field,
                'Sector': flowbysector_field,
                'SectorType': sector_type_field
            })
        # column doesn't exist for sector-like activities, so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp,
                                             how='left',
                                             on=flowbyactivity_field)
    # normalize missing sector values to None (rather than NaN) for
    # downstream consumers
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.replace(
        {np.nan: None})
    # add sector source name
    flowbyactivity_wsector_df = flowbyactivity_wsector_df.assign(
        SectorSourceName=sectorsourcename)

    # if activities are sector-like check that the sectors are in the crosswalk
    if src_info['sector-like_activities']:
        flowbyactivity_wsector_df = replace_naics_w_naics_2012(
            flowbyactivity_wsector_df, sectorsourcename)

    return flowbyactivity_wsector_df
Example #6
0
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                        drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(
                                                   flow_subset_wsec, attr=attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k

                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped,
                                                            attr, method)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector like, check that there is no data loss
                    if load_source_catalog(
                    )[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            fbs_subset = fbs[(
                                (fbs[fba_activity_fields[0]] == n) &
                                (fbs[fba_activity_fields[1]] == n)) |
                                             (fbs[fba_activity_fields[0]] == n)
                                             |
                                             (fbs[fba_activity_fields[1]] == n
                                              )].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)

                # if allocation method for an activity set requires a specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info(
                        'Calling on function specified in method yaml to allocate '
                        + ', '.join(map(str, names)) + ' to sectors')
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                                      flow_subset_mapped, attr, fbs_list)

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    # clean df and harmonize unites
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivty source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(
                                                     fba_allocation, attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                                fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec,
                        k,
                        names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info(
                            "Using the specified allocation help for subset of "
                            + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [
                        e for e in group_cols
                        if e not in ('ActivityProducedBy',
                                     'ActivityConsumedBy')
                    ]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset,
                            k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2,
                                k,
                                attr['allocation_source'],
                                attr['allocation_method'],
                                group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k,
                                            method_name)

                    # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \
                    # aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].
                         isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].
                         isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info(
                        "Checking if flowbyactivity and allocation dataframes use the same location systems"
                    )
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorProducedBy',
                            'ActivityProducedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorConsumedBy',
                            'ActivityConsumedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[
                        'FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=[
                        'Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                        'FlowAmountRatio_y', 'FlowAmountRatio',
                        'FBA_Activity_x', 'FBA_Activity_y'
                    ])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[
                        attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the target sector length, if so,
                # allocate values equally to disaggregated sectors
                log.info('Checking for data at ' +
                         method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index(
                        drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull())
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(flows_subset_geo,
                                                       fbs_sector_subset, aset,
                                                       k, method_name, attr,
                                                       method, mapping_files)

                log.info(
                    "Completed flowbysector for activity subset with flows " +
                    ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
Example #7
0
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Compare the loaded flowbyactivity total, after subsetting by activity
    and geography, with the final flowbysector output total for the target
    sector level. Not a direct comparison of the loaded FBA because FBAs
    are modified before being subset by activity.
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml for source data
    :param activity_attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :return: None; data differences between loaded FBA and FBS output totals
             by location are written to the validation log
    """

    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # load source catalog to determine if activities are sector-like
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load,
                                activity_attr['allocation_from_scale'],
                                method['target_geoscale'])
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2 to avoid double counting
        # flows that appear at multiple sector lengths
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # columns kept for the comparison and the grouping used to total them
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba totals
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs totals
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] =\
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info(
            'Comparing FBA %s %s subset to FBS results. Details in Validation Log',
            activity_set, source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = df_merge[(df_merge['Context'] == i) & (
                df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # NaN means the FBA total had no matching FBS total in the merge
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # diff_units = df_merge_subset['FBS_unit'][0]
            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is '
                    'equal to the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt still
        # propagate; log the traceback so the failure is debuggable
        vLog.exception('Error occurred when comparing total FlowAmounts '
                       'for FlowByActivity and FlowBySector')
Example #8
0
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level
    output, checking for data loss.
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: None; data differences between loaded FBA and FBS output are
             written to the validation log
    """
    if load_source_catalog()[source_name]['sector-like_activities']:
        vLog.debug(
            'Not comparing loaded FlowByActivity to FlowBySector ratios '
            'for a dataset with sector-like activities because if there '
            'are modifications to flowamounts for a sector, then the '
            'ratios will be different')
    else:
        # subset fba df to the columns relevant to the comparison
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        # compare at the national level only
        fba.loc[:, 'Location'] = US_FIPS
        # single key list shared by the FBA aggregation, the FBS aggregation
        # (plus SectorLength), and the merge - previously duplicated three times
        merge_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, merge_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df

        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        # sector length of a row is the longer of the produced/consumed sectors
        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        fbs_agg = aggregator(fbs, merge_cols + ['SectorLength'])
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge FBA and FBS totals; identical left_on/right_on lists replaced
        # with a single 'on' list
        df_merge = fba_agg.merge(fbs_agg, on=merge_cols, how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        # flag ratios that deviate from 1 by more than the tolerance
        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info(
                'There are %s combinations of flowable/context/sector length where the '
                'flowbyactivity to flowbysector ratio is less than or greater than 1 by %s',
                len(comparison2), str(tolerance))

        # include df subset in the validation log
        # only print rows where flowamount ratio is less than 1 (round flowamountratio)
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector ratios '
            'for %s in validation log', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FlowByActivity load to FlowBySector ratios for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
Example #9
0
def compare_fba_load_and_fbs_output_totals(fba_load, fbs_load, activity_set,
                                           source_name, method_name, attr,
                                           method, mapping_files):
    """
    Compare the loaded flowbyactivity total with the final flowbysector
    output total.
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param method_name: str, FBS method name, used in the output csv filename
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param mapping_files: mapping data passed to map_elementary_flows
    :return: None; prints data differences between loaded FBA and FBS output
             totals and saves results as csv in output folder
    """

    from flowsa.flowbyfunctions import subset_df_by_geoscale, sector_aggregation
    from flowsa.common import load_source_catalog
    from flowsa.mapping import map_elementary_flows

    log.info(
        'Comparing loaded FlowByActivity FlowAmount total to subset FlowBySector FlowAmount total'
    )

    # load source catalog to determine if activities are sector-like
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, attr['allocation_from_scale'],
                                method['target_geoscale'])
    # map loaded fba
    fba = map_elementary_flows(fba, mapping_files, keep_unmapped_rows=True)
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2 to avoid double counting
        # flows that appear at multiple sector lengths
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # columns kept for the comparison and the grouping used to total them
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # fba totals
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg.rename(columns={
        'FlowAmount': 'FBA_amount',
        'Unit': 'FBA_unit'
    },
                   inplace=True)

    # fbs totals
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg.rename(columns={
        'FlowAmount': 'FBS_amount',
        'Unit': 'FBS_unit'
    },
                   inplace=True)

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] = (df_merge['FlowAmount_difference'] /
                                          df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts
        context_list = df_merge['Context'].to_list()

        # loop through the contexts and print results of comparison
        for i in context_list:
            df_merge_subset = df_merge[df_merge['Context'] == i].reset_index(
                drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            # make reporting more manageable
            if abs(diff_per) > 0.001:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            # report the direction of the difference; a zero difference is
            # reported as equal rather than falsely as "more than"
            if diff_per > 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% less than the total FlowByActivity FlowAmount')
            elif diff_per < 0:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i + ' is ' +
                         str(abs(diff_per)) +
                         '% more than the total FlowByActivity FlowAmount')
            else:
                log.info('The total FlowBySector FlowAmount for ' +
                         source_name + ' ' + activity_set + ' ' + i +
                         ' is equal to the total FlowByActivity FlowAmount')

        # save csv to output folder
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector total FlowAmounts for '
            + activity_set + ' in output folder')
        # output data at all sector lengths
        df_merge.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                        method_name + '_' + source_name +
                        "_FBA_total_to_FBS_total_FlowAmount_comparison_" +
                        activity_set + ".csv",
                        index=False)

    except Exception:
        # narrowed from a bare except so SystemExit/KeyboardInterrupt still
        # propagate; log the traceback so the failure is debuggable
        log.exception(
            'Error occurred when comparing total FlowAmounts for FlowByActivity and FlowBySector'
        )

    return None
Example #10
0
def estimate_suppressed_data(df, sector_column, naics_level, sourcename):
    """
    Estimate data suppression, by equally allocating parent NAICS values to child NAICS.

    Suppressed child sectors are identified by a FlowAmount of 0; the parent
    sector's remaining (unallocated) flow is split equally among them.
    :param df: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param naics_level: numeric, indicate at what NAICS length to base
                        estimated suppressed data off (2 - 5)
    :param sourcename: str, sourcename, used to look up whether activities
                       are sector-like in the source catalog
    :return: df, with estimated suppressed data
    """

    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df)

    # find the longest length sector
    max_length = max(df[sector_column].apply(lambda x: len(str(x))).unique())
    # loop through starting at naics_level, use most detailed level possible to save time
    for i in range(naics_level, max_length):
        # create df of i length (the parent level for this pass)
        df_x = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # create df of i + 1 length (the child level for this pass)
        df_y = df.loc[df[sector_column].apply(lambda x: len(x) == i + 1)]
        # create temp sector columns in df y, that are i digits in length
        # (the child's parent sector, used to group children under a parent)
        df_y = df_y.assign(s_tmp=df_y[sector_column].apply(lambda x: x[0:i]))

        # create list of location and temp activity combos that contain a 0
        # (a FlowAmount of 0 flags a suppressed child sector)
        missing_sectors_df = df_y[df_y['FlowAmount'] == 0]
        missing_sectors_list = missing_sectors_df[['Location',
                                                   's_tmp']].drop_duplicates().values.tolist()
        # subset the y df; skip the pass entirely if nothing is suppressed
        if len(missing_sectors_list) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows set conditions
            suppressed_list = []
            for q, r, in missing_sectors_list:
                c1 = df_y['Location'] == q
                c2 = df_y['s_tmp'] == r
                # subset data to all children of a parent with suppression
                suppressed_list.append(df_y.loc[c1 & c2])
            suppressed_sectors = pd.concat(suppressed_list, sort=False, ignore_index=True)
            # add column of existing allocated data for length of i
            # (sum of the non-suppressed siblings under each parent)
            suppressed_sectors['alloc_flow'] =\
                suppressed_sectors.groupby(['Location', 's_tmp'])['FlowAmount'].transform('sum')
            # subset further so only keep rows of 0 value
            suppressed_sectors_sub = suppressed_sectors[suppressed_sectors['FlowAmount'] == 0]
            # add count of suppressed siblings, used below to split the
            # remaining parent flow equally among them
            suppressed_sectors_sub = \
                suppressed_sectors_sub.assign(sector_count=
                                              suppressed_sectors_sub.groupby(
                                                  ['Location', 's_tmp']
                                              )['s_tmp'].transform('count'))

            # merge suppressed sector subset with df x; right merge keeps one
            # row per suppressed child, matched to its parent's FlowAmount
            df_m = pd.merge(df_x,
                            suppressed_sectors_sub[['Class', 'Compartment', 'FlowType',
                                                    'FlowName', 'Location', 'LocationSystem',
                                                    'Unit', 'Year', sector_column, 's_tmp',
                                                    'alloc_flow', 'sector_count']],
                            left_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                                     'Location', 'LocationSystem', 'Unit', 'Year', sector_column],
                            right_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                                      'Location', 'LocationSystem', 'Unit', 'Year', 's_tmp'],
                            how='right')
            # drop any rows where flowamount is none (no parent row matched)
            df_m = df_m[~df_m['FlowAmount'].isna()]
            # calculate estimated flows by subtracting the flow
            # amount already allocated from total flow of
            # sector one level up and divide by number of sectors with suppressed data
            df_m.loc[:, 'FlowAmount'] = \
                (df_m['FlowAmount'] - df_m['alloc_flow']) / df_m['sector_count']
            # only keep the suppressed sector subset activity columns
            df_m = df_m.drop(columns=[sector_column + '_x', 's_tmp', 'alloc_flow', 'sector_count'])
            df_m = df_m.rename(columns={sector_column + '_y': sector_column})
            # reset activity columns to mirror sectors when activities are sector-like
            if load_source_catalog()[sourcename]['sector-like_activities']:
                df_m = df_m.assign(ActivityProducedBy=df_m['SectorProducedBy'])
                df_m = df_m.assign(ActivityConsumedBy=df_m['SectorConsumedBy'])

            # drop the existing rows with suppressed data and append the new estimates from fba df
            # (outer merge + left_only query acts as an anti-join against the estimates)
            modified_df =\
                pd.merge(df, df_m[['FlowName', 'Location', sector_column]],
                         indicator=True,
                         how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
            df = pd.concat([modified_df, df_m], ignore_index=True)
    # restore NoneType values that were replaced with empty strings above
    df_w_estimated_data = replace_strings_with_NoneType(df)

    return df_w_estimated_data
Example #11
0
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not, sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """

    # determine if activities are sector-like, if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # load source catalog
        cat = load_source_catalog()
        # assumes a single source per df - catalog info is read for the
        # first SourceName found
        s = pd.unique(df_load['SourceName'])[0]
        # load catalog info for source
        src_info = cat[s]
        sector_like_activities = src_info['sector-like_activities']

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # if activities are sector-like, drop the activity columns from the df
    # and the grouping columns; they are restored as copies of the sector
    # columns at the end of the function
    if sector_like_activities:
        group_cols = [e for e in group_cols if e not in
                      ('ActivityProducedBy', 'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns if e not in
                   ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length - 1, 1, -1):
        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
                           df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create a list of i digit sectors in df subset
        sector_subset = df_subset[
            ['Location', fbs_activity_fields[0],
             fbs_activity_fields[1]]].drop_duplicates().reset_index(drop=True)
        df_sectors = sector_subset.copy()
        df_sectors.loc[:, 'SectorProducedBy'] = \
            df_sectors['SectorProducedBy'].apply(lambda x: x[0:i])
        df_sectors.loc[:, 'SectorConsumedBy'] = \
            df_sectors['SectorConsumedBy'].apply(lambda x: x[0:i])
        sector_list = df_sectors.drop_duplicates().values.tolist()
        # create a list of sectors that are exactly i digits long
        # where either sector column is i digits in length
        df_existing_1 = \
            sector_subset.loc[(sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                              (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # where both sector columns are i digits in length
        df_existing_2 = \
            sector_subset.loc[(sector_subset['SectorProducedBy'].apply(lambda x: len(x) == i)) &
                              (sector_subset['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # concat existing dfs
        df_existing = pd.concat([df_existing_1, df_existing_2], sort=False)
        existing_sectors = df_existing.drop_duplicates().dropna().values.tolist()
        # list of sectors of length i that are not in sector list
        missing_sectors = [e for e in sector_list if e not in existing_sectors]
        if len(missing_sectors) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows
            # set conditions
            agg_sectors_list = []
            for q, r, s in missing_sectors:
                c1 = df_subset['Location'] == q
                c2 = df_subset[fbs_activity_fields[0]].apply(lambda x: x[0:i] == r)
                c3 = df_subset[fbs_activity_fields[1]].apply(lambda x: x[0:i] == s)
                # subset data
                agg_sectors_list.append(df_subset.loc[c1 & c2 & c3])
            agg_sectors = pd.concat(agg_sectors_list, sort=False)
            # keep only rows more disaggregated than length i, then truncate
            # sectors to i digits before summing
            agg_sectors = agg_sectors.loc[
                (agg_sectors[fbs_activity_fields[0]].apply(lambda x: len(x) > i)) |
                (agg_sectors[fbs_activity_fields[1]].apply(lambda x: len(x) > i))]
            agg_sectors.loc[:, fbs_activity_fields[0]] = agg_sectors[fbs_activity_fields[0]].apply(
                lambda x: x[0:i])
            agg_sectors.loc[:, fbs_activity_fields[1]] = agg_sectors[fbs_activity_fields[1]].apply(
                lambda x: x[0:i])
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(agg_sectors, group_cols)
            # append to df; pd.concat replaces DataFrame.append,
            # which was removed in pandas 2.0
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors], sort=False).reset_index(drop=True)

    # manually modify non-NAICS codes that might exist in sector
    df.loc[:, 'SectorConsumedBy'] = np.where(df['SectorConsumedBy'].isin(['F0', 'F01']),
                                             'F010', df['SectorConsumedBy'])  # domestic/household
    df.loc[:, 'SectorProducedBy'] = np.where(df['SectorProducedBy'].isin(['F0', 'F01']),
                                             'F010', df['SectorProducedBy'])  # domestic/household
    # drop any duplicates created by modifying sector codes
    df = df.drop_duplicates()
    # if activities are source-like, set col values as copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df)

    return df
Example #12
0
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments, only argument is "method_name", the name of method
                   corresponding to flowbysector method yaml name
    :return: parquet, FBS save to local folder
    """
    # allow the function to be called directly with kwargs or from the
    # command line via parse_args()
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = map_fbs_flows(flows,
                                                        k,
                                                        v,
                                                        keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity - keep rows where either the
                # produced-by or consumed-by activity is in the activity set
                flows_subset =\
                    flows_mapped[(flows_mapped[fba_activity_fields[0]].isin(names)) |
                              (flows_mapped[fba_activity_fields[1]].isin(names)
                               )].reset_index(drop=True)

                # if activities are sector-like, check sectors are valid
                if load_source_catalog()[k]['sector-like_activities']:
                    flows_subset2 =\
                        replace_naics_w_naics_from_another_year(flows_subset,
                                                                method['target_sector_source'])
                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec =\
                    add_sectors_to_flowbyactivity(flows_subset_geo,
                                                  sectorsourcename=method['target_sector_source'],
                                                  allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = \
                        dynamically_import_fxn(k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                                               attr=attr,
                                                                               method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need to create alloc ratios,
                # else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs =\
                        dataset_allocation_method(flows_mapped_wsec, attr,
                                                  names, method, k, v, aset,
                                                  method_name, aset_names)

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale - use the smaller (more aggregated) of
                # the two geoscales as the starting point
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = allocate_dropped_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                # fbs_1: both sector columns in the target sector list
                fbs_1 = fbs_agg_2.loc[(fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                                      (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                # fbs_2: only SectorProducedBy populated and in the list
                fbs_2 = fbs_agg_2.loc[(fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                                      (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                # fbs_3: only SectorConsumedBy populated and in the list
                fbs_3 = fbs_agg_2.loc[(fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                                      (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow dt is already in FBS format, append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type, add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of model results in %s',
        logoutputpath)
Example #13
0
def get_fba_allocation_subset(fba_allocation, source, activitynames, **kwargs):
    """
    Subset the fba allocation data based on NAICS associated with activity
    :param fba_allocation: df, FBA format
    :param source: str, source name
    :param activitynames: list, activity names in activity set
    :param kwargs: can be the mapping file and method of allocation
    :return: df, FBA subset
    """
    # first determine if there are special cases that would modify the typical method of subset
    # an example of a special case is when the allocation method is 'proportional-flagged'
    subset_by_sector_cols = False
    subset_by_column_value = False
    # initialize optional kwargs so a misconfigured call fails with a clear
    # error rather than a NameError
    fsm = None
    asn = None
    if kwargs:
        if 'flowSubsetMapped' in kwargs:
            fsm = kwargs['flowSubsetMapped']
        if 'allocMethod' in kwargs:
            am = kwargs['allocMethod']
            if am == 'proportional-flagged':
                subset_by_sector_cols = True
        if 'activity_set_names' in kwargs:
            asn = kwargs['activity_set_names']
            if asn is not None:
                if 'allocation_subset_col' in asn:
                    subset_by_column_value = True

    # load the source catalog
    cat = load_source_catalog()
    src_info = cat[source]
    if src_info['sector-like_activities'] is False:
        # read in source crosswalk
        df = get_activitytosector_mapping(source)
        sec_source_name = df['SectorSourceName'][0]
        df = expand_naics_list(df, sec_source_name)
        # subset source crosswalk to only contain values pertaining to list of activity names
        df = df.loc[df['Activity'].isin(activitynames)]
        # turn column of sectors related to activity names into list
        sector_list = pd.unique(df['Sector']).tolist()
        # subset fba allocation table to the values in
        # the activity list, based on overlapping sectors
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    sector_list)].reset_index(drop=True)
        else:
            fba_allocation_subset = \
                fba_allocation.loc[(fba_allocation[fbs_activity_fields[0]].isin(sector_list)) |
                                   (fba_allocation[fbs_activity_fields[1]].isin(sector_list))]. \
                    reset_index(drop=True)
    else:
        if 'Sector' in fba_allocation:
            fba_allocation_subset =\
                fba_allocation.loc[fba_allocation['Sector'].isin(
                    activitynames)].reset_index(drop=True)
        elif subset_by_sector_cols:
            # if it is a special case, then base the subset of data on
            # sectors in the sector columns, not on activitynames
            fsm_sub = fsm.loc[
                (fsm[fba_activity_fields[0]].isin(activitynames)) |
                (fsm[fba_activity_fields[1]].isin(activitynames))].reset_index(
                    drop=True)
            # stack both sector columns into a single 'Sector' column;
            # rename() returns new frames, avoiding chained-assignment
            # warnings from modifying column labels on a slice
            part1 = fsm_sub[['SectorConsumedBy']].rename(
                columns={'SectorConsumedBy': 'Sector'})
            part2 = fsm_sub[['SectorProducedBy']].rename(
                columns={'SectorProducedBy': 'Sector'})
            modified_activitynames = pd.concat(
                [part1, part2], ignore_index=True).drop_duplicates()
            modified_activitynames =\
                modified_activitynames[modified_activitynames['Sector'].notnull()]
            modified_activitynames = modified_activitynames['Sector'].tolist()
            fba_allocation_subset = \
                fba_allocation.loc[
                    (fba_allocation[fbs_activity_fields[0]].isin(modified_activitynames)) |
                    (fba_allocation[fbs_activity_fields[1]].isin(modified_activitynames))]. \
                    reset_index(drop=True)

        else:
            fba_allocation_subset =\
                fba_allocation.loc[(fba_allocation[fbs_activity_fields[0]].isin(activitynames)) |
                                   (fba_allocation[fbs_activity_fields[1]].isin(activitynames))].\
                    reset_index(drop=True)

    # if activity set names included in function call and activity set names is not null, \
    # then subset data based on value and column specified
    if subset_by_column_value:
        # create subset of activity names and allocation subset metrics
        asn_subset = asn[asn['name'].isin(activitynames)].reset_index(
            drop=True)
        if asn_subset['allocation_subset'].isna().all():
            pass
        elif asn_subset['allocation_subset'].isna().any():
            log.error(
                'Define column and value to subset on in the activity set csv for all rows'
            )
        else:
            col_to_subset = asn_subset['allocation_subset_col'][0]
            val_to_subset = asn_subset['allocation_subset'][0]
            # subset fba_allocation_subset further
            log.debug('Subset the allocation dataset where %s = %s',
                      str(col_to_subset), str(val_to_subset))
            fba_allocation_subset = fba_allocation_subset[
                fba_allocation_subset[col_to_subset] ==
                val_to_subset].reset_index(drop=True)

    return fba_allocation_subset