Example #1
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'flowname_subset',
                   'compartment_subset', 'clean_fba', 'clean_fba_w_sec'
    :return: df, fba format
    """

    log.info("Loading allocation flowbyactivity %s for year %s", fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(datasource=fba_sourcename,
                                        year=df_year,
                                        flowclass=flowclass)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level", geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivity source, if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]

    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = fba.loc[fba['Compartment'].isin(kwargs['compartment_subset'])]
    # cleanup the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(fba_sourcename, kwargs["clean_fba"])(fba, attr=attr)
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sector to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(fba, sectorsourcename=method['target_sector_source'])

    # call on a function to further clean up/disaggregate the fba
    # allocation data, if one exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(fba_sourcename,
                                          kwargs['clean_fba_w_sec'])(fba_wsec, attr=attr,
                                                                     method=method,
                                                                     sourcename=fba_sourcename)

    return fba_wsec
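Every example on this page resolves a source-specific helper by name through dynamically_import_fxn. Below is a minimal sketch of how such a lookup can be built with importlib; the package path and the lack of error handling are assumptions, not flowsa's actual implementation.

import importlib


def dynamically_import_fxn_sketch(source_name, function_name,
                                  package='flowsa.data_source_scripts'):
    # import the module named after the data source, e.g.
    # <package>.<source_name>, then look up the named function in it
    module = importlib.import_module(f'{package}.{source_name}')
    return getattr(module, function_name)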
Example #2
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowByActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters
    :return: df of identified parquet
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving flowbyactivity for datasource %s in year %s", k,
                  str(v['year']))
        flows_df = flowsa.getFlowByActivity(datasource=k,
                                            year=v['year'],
                                            flowclass=v['class'],
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = dynamically_import_fxn(k, v["FBS_datapull_fxn"])(v)
    else:
        vLog.error(
            "Data format not specified in method file for datasource %s", k)
        # avoid returning an undefined name when the format is unrecognized
        flows_df = None

    return flows_df
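The datasource parameters v come straight from the FBS method yaml. A hypothetical dictionary showing the keys this function reads; the source name and the values are placeholders, not taken from a real method file.

example_source_params = {
    'data_format': 'FBA',              # 'FBA', 'FBS', or 'FBS_outside_flowsa'
    'year': 2015,                      # year requested from the FBA
    'class': ['Water'],                # flowclass filter
    'source_fba_load_scale': 'state',  # optional geoscale to load
}
# flows = load_source_dataframe('USGS_NWIS_WU', example_source_params)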
Example #3
def function_allocation_method(flow_subset_mapped, k, names, attr, fbs_list):
    """
    Allocate df activities to sectors using a function identified in the FBS method yaml
    :param flow_subset_mapped: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, fbs dfs created running flowbysector.py
    :return: df, FBS format, with activity columns allocated to sectors
    """
    log.info('Calling on function specified in method yaml to allocate '
             '%s to sectors', ', '.join(map(str, names)))
    fbs = dynamically_import_fxn(k, attr['allocation_source'])(flow_subset_mapped, attr, fbs_list)
    return fbs
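The function named in attr['allocation_source'] is imported from the source's script and called with the mapped FBA, the activity-set attributes, and the FBS built so far. A hypothetical skeleton showing the expected signature and return value (the name and body are illustrative only):

def example_allocation_fxn(flow_subset_mapped, attr, fbs_list):
    # a real allocation function applies source-specific logic here and
    # returns a FlowBySector-formatted dataframe
    fbs = flow_subset_mapped.copy()
    return fbs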
Example #4
def parse_data(dataframe_list, args, config):
    """
    Calls on functions defined in source.py files, as parsing rules are specific to the data source.
    :param dataframe_list: list, dfs to concat and parse
    :param args: dictionary, load parameters 'source' and 'year'
    :param config: dictionary, FBA yaml
    :return: df, single df formatted to FBA
    """
    # default to None so the function does not fail if no parse function
    # is specified in the yaml
    df = None
    if "parse_response_fxn" in config:
        # dynamically import and call on function
        df = dynamically_import_fxn(args['source'],
                                    config["parse_response_fxn"])(
                                        dataframe_list=dataframe_list,
                                        args=args)
    return df
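The function named in config["parse_response_fxn"] receives the collected dataframes and the load parameters as keyword arguments. A hypothetical parse function matching that call; the column assignments are illustrative:

import pandas as pd


def example_parse_response(*, dataframe_list, args):
    # concatenate the raw frames and stamp them with the load parameters;
    # a real parser also reshapes the data into FBA columns
    df = pd.concat(dataframe_list, ignore_index=True)
    df['SourceName'] = args['source']
    df['Year'] = args['year']
    return df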
Example #5
def parse_data(*, df_list, source, year, config):
    """
    Calls on functions defined in source.py files, as parsing rules
    are specific to the data source.
    :param df_list: list, dfs to concat and parse
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: df, single df formatted to FBA
    """
    if "parse_response_fxn" in config:
        # dynamically import and call on function
        df = dynamically_import_fxn(
            source, config["parse_response_fxn"])(df_list=df_list,
                                                  source=source,
                                                  year=year,
                                                  config=config)
    return df
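Because the parameters of this version are keyword-only (the leading * in the signature), callers must name every argument. A hypothetical call; 'Example_Source' and 'example_parse_fba' stand in for a real data source and its parse function:

df = parse_data(df_list=[],   # normally the list returned by call_urls
                source='Example_Source',
                year='2017',
                config={'parse_response_fxn': 'example_parse_fba'})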
Example #6
def assemble_urls_for_query(build_url, config, args):
    """
    Calls on helper functions defined in source.py files to replace parts of the url string
    :param build_url: str, base url
    :param config: dictionary, FBA yaml
    :param args: dictionary, load parameters 'source' and 'year'
    :return: list, urls to call data from
    """

    if "url_replace_fxn" in config:
        # dynamically import and call on function
        urls = dynamically_import_fxn(
            args['source'], config["url_replace_fxn"])(build_url=build_url,
                                                       config=config,
                                                       args=args)
    else:
        urls = [build_url]
    return urls
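The helper named in config["url_replace_fxn"] is handed the partially built url plus the config and load parameters, and returns the final list of urls. A hypothetical replacement helper that expands a '__state__' placeholder; the placeholder and the 'states' config key are illustrative:

def example_url_replace(*, build_url, config, args):
    # build one url per state abbreviation listed in the (hypothetical)
    # 'states' entry of the yaml config
    return [build_url.replace('__state__', state)
            for state in config.get('states', [])]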
Example #7
def load_source_dataframe(sourcename, source_dict, download_FBA_if_missing):
    """
    Load the source dataframe. Data can be a FlowByActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param sourcename: str, The datasource name
    :param source_dict: dictionary, The datasource parameters
    :param download_FBA_if_missing: Bool, if True will download FBAs from
       Data Commons. Default is False.
    :return: df of identified parquet
    """
    if source_dict['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter
        # to filter dataframe
        if 'source_fba_load_scale' in source_dict:
            geo_level = source_dict['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving Flow-By-Activity for datasource %s in year %s",
                  sourcename, str(source_dict['year']))
        flows_df = flowsa.getFlowByActivity(
            datasource=sourcename,
            year=source_dict['year'],
            flowclass=source_dict['class'],
            geographic_level=geo_level,
            download_FBA_if_missing=download_FBA_if_missing)
    elif source_dict['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        flows_df = flowsa.getFlowBySector(sourcename)
    elif source_dict['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        flows_df = dynamically_import_fxn(
            sourcename, source_dict["FBS_datapull_fxn"])(source_dict)
    else:
        vLog.error(
            "Data format not specified in method "
            "file for datasource %s", sourcename)
        # avoid returning an undefined name when the format is unrecognized
        flows_df = None

    return flows_df
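For the 'FBS_outside_flowsa' branch, the function named in source_dict["FBS_datapull_fxn"] receives the whole datasource dictionary and must return a dataframe already in FBS format. A hypothetical stub showing that contract; the column list is abbreviated:

import pandas as pd


def example_fbs_datapull(source_dict):
    # a real function would build the FBS from another package's output,
    # using parameters such as source_dict['year']
    return pd.DataFrame(columns=['Flowable', 'SectorProducedBy',
                                 'SectorConsumedBy', 'FlowAmount'])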
Example #8
def call_urls(url_list, args, config):
    """
    Calls each url that has been generated and passes the response to a
    processing function that begins processing the returned data. The
    processing function is specific to the data source, so this function
    relies on a function defined in the source's source.py file.
    :param url_list: list, urls to call
    :param args: dictionary, load parameters 'source' and 'year'
    :param config: dictionary, FBA yaml
    :return: list, dfs to concat and parse
    """
    # start requests session
    s = requests.Session()
    # identify if url request requires cookies set
    if 'allow_http_request_cookies' in config:
        set_cookies = 'yes'
    else:
        set_cookies = 'no'

    # create dataframes list by iterating through url list
    data_frames_list = []
    if url_list[0] is not None:
        for url in url_list:
            log.info("Calling %s", url)
            r = make_http_request(url,
                                  requests_session=s,
                                  set_cookies=set_cookies)
            if "call_response_fxn" in config:
                # dynamically import and call on function
                df = dynamically_import_fxn(
                    args['source'], config["call_response_fxn"])(url=url,
                                                                 r=r,
                                                                 args=args)
            if isinstance(df, pd.DataFrame):
                data_frames_list.append(df)
            elif isinstance(df, list):
                data_frames_list.extend(df)

    return data_frames_list
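The function named in config["call_response_fxn"] is called once per url with the url, the response object, and the load parameters. A hypothetical handler that assumes the response body is CSV text:

import io

import pandas as pd


def example_call_response(*, url, r, args):
    # parse the HTTP response body; a real handler is specific to the
    # source's file format (CSV, JSON, Excel, ...)
    return pd.read_csv(io.StringIO(r.text))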
Example #9
def call_urls(*, url_list, source, year, config):
    """
    Calls each url that has been generated and passes the response to a
    processing function that begins processing the returned data. The
    processing function is specific to the data source, so this function
    relies on a function defined in the source's source.py file.
    :param url_list: list, urls to call
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: list, dfs to concat and parse
    """
    # identify if url request requires cookies set
    set_cookies = config.get('allow_http_request_cookies')
    confirm_gdrive = config.get('confirm_gdrive')

    # create dataframes list by iterating through url list
    data_frames_list = []
    if url_list[0] is not None:
        for url in url_list:
            log.info("Calling %s", url)
            resp = make_url_request(url,
                                    set_cookies=set_cookies,
                                    confirm_gdrive=confirm_gdrive)
            if "call_response_fxn" in config:
                # dynamically import and call on function
                df = dynamically_import_fxn(
                    source, config["call_response_fxn"])(resp=resp,
                                                         source=source,
                                                         year=year,
                                                         config=config,
                                                         url=url)
            if isinstance(df, pd.DataFrame):
                data_frames_list.append(df)
            elif isinstance(df, list):
                data_frames_list.extend(df)

    return data_frames_list
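In this keyword-only version the response object arrives as resp, and source, year, and config are passed through so the handler can label or filter the data itself. A hypothetical handler matching that signature; the CSV assumption and the added columns are illustrative:

import io

import pandas as pd


def example_call_response_v2(*, resp, source, year, config, url):
    # parse the response and stamp it with the load parameters
    df = pd.read_csv(io.StringIO(resp.text))
    df['SourceName'] = source
    df['Year'] = year
    return df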
Example #10
def assemble_urls_for_query(*, source, year, config):
    """
    Calls on helper functions defined in source.py files to
    replace parts of the url string
    :param source: str, data source
    :param year: str, year
    :param config: dictionary, FBA yaml
    :return: list, urls to call data from
    """
    # if there are url parameters defined in the yaml,
    # then build a url, else use "base_url"
    urlinfo = config['url']
    if urlinfo == 'None':
        return [None]

    if 'url_params' in urlinfo:
        params = parse.urlencode(urlinfo['url_params'], safe='=&%',
                                 quote_via=parse.quote)
        build_url = urlinfo['base_url'] + urlinfo['api_path'] + params
    else:
        build_url = urlinfo['base_url']

    # substitute the year from the arguments and the user's API key into the url
    build_url = build_url.replace("__year__", str(year))
    if "__apiKey__" in build_url:
        userAPIKey = load_api_key(config['api_name'])  # (common.py fxn)
        build_url = build_url.replace("__apiKey__", userAPIKey)

    if "url_replace_fxn" in config:
        # dynamically import and call on function
        urls = dynamically_import_fxn(
            source, config["url_replace_fxn"])(build_url=build_url,
                                               source=source,
                                               year=year,
                                               config=config)
        return urls
    else:
        return [build_url]
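A worked illustration of the url assembly above, with a hypothetical yaml fragment expressed as a dict; the host and parameter names are made up, and the API key placeholder is left unresolved:

from urllib import parse

urlinfo = {
    'base_url': 'https://api.example.gov/',
    'api_path': 'data?',
    'url_params': {'year': '__year__', 'key': '__apiKey__'},
}
params = parse.urlencode(urlinfo['url_params'], safe='=&%',
                         quote_via=parse.quote)
build_url = urlinfo['base_url'] + urlinfo['api_path'] + params
build_url = build_url.replace('__year__', '2017')
# build_url is now 'https://api.example.gov/data?year=2017&key=__apiKey__'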
Example #11
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr, fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disaggregation to capture any missing lower-level NAICS
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector column is not entirely None, that is the column to merge on
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:

        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')
        # load BEA codes that substitute for NAICS
        bea = return_bea_codes_used_as_naics()
        # replace the Sector and HelperFlow values when the sector column
        # to merge is in the BEA list, to prevent dropped data
        modified_fba_allocation['Sector'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation[sector_col_to_merge],
                     modified_fba_allocation['Sector'])
        modified_fba_allocation['HelperFlow'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation['FlowAmount'],
                     modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace missing helper flow values with 0 so the multiplication
        # does not carry an incorrect value into the new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                           modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows where FlowAmount is 0
    modified_fba_allocation =\
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)
    return modified_fba_allocation
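A tiny numeric illustration of the 'multiplication' helper method above: each FlowAmount (a per-employee rate, for instance) is multiplied by the HelperFlow merged in from the helper dataset, after missing or zero helper values are filled from a replacement value. All numbers are made up.

import numpy as np
import pandas as pd

df = pd.DataFrame({'Sector': ['1111', '1112', '1113'],
                   'FlowAmount': [2.0, 3.0, 4.0],       # e.g. gal/employee
                   'HelperFlow': [10.0, np.nan, 0.0],   # e.g. employees
                   'ReplacementValue': [10.0, 20.0, 30.0]})
# fill missing helper values, then swap zeros for the replacement value
df['HelperFlow'] = df['HelperFlow'].fillna(df['ReplacementValue'])
df['HelperFlow'] = np.where(df['HelperFlow'] == 0,
                            df['ReplacementValue'], df['HelperFlow'])
df['FlowAmount'] = df['FlowAmount'] * df['HelperFlow']
# FlowAmount is now [20.0, 60.0, 120.0], e.g. gal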
Example #12
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments; "method" is the name of the
        flowbysector method yaml, and "download_FBAs_if_missing" optionally
        indicates whether missing FBAs should be downloaded
    :return: parquet, FBS saved to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the ActivityProducedBy
                # and ActivityConsumedBy columns, if an activity has already
                # been read in and allocated, remove that activity from the
                # mapped flows regardless of which activity set it was read
                # in with
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving '
                        'difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data at a subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if the allocation method is "direct", there is no need
                # to create allocation ratios; otherwise use an allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v, aset,
                                                    aset_names,
                                                    download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(v["clean_fbs_df_fxn_source"],
                                               v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # temporarily reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of '
        'model results in %s', logoutputpath)
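main() reads its arguments from kwargs or, when called with none, from the command line via parse_args(). A hypothetical invocation; the method name is illustrative and must match an FBS method yaml available to flowsa:

if __name__ == '__main__':
    main(method='Water_national_2015_m1',
         download_FBAs_if_missing=True)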