Example #1
def map_fbs_flows(fbs, from_fba_source, v, **kwargs):
    """
    Identifies the mapping file and applies mapping to fbs flows
    :param fbs: flow-by-sector dataframe
    :param from_fba_source: str Source name of fba list to look for mappings
    :param v: dictionary, The datasource parameters
    :param kwargs: includes keep_unmapped_columns and keep_fba_columns
    :return fbs_mapped: df, with flows mapped using federal elementary
           flow list or material flow list
    :return mapping_files: str, name of mapping file
    """
    ignore_source_name = False
    if 'mfl_mapping' in v:
        mapping_files = v['mfl_mapping']
        log.info("Mapping flows in %s to material flow list", from_fba_source)
        flow_type = 'WASTE_FLOW'
        ignore_source_name = True
    else:
        log.info("Mapping flows in %s to federal elementary flow list",
                 from_fba_source)
        if 'fedefl_mapping' in v:
            mapping_files = v['fedefl_mapping']
            ignore_source_name = True
        else:
            mapping_files = from_fba_source
        flow_type = 'ELEMENTARY_FLOW'

    fbs_mapped = map_flows(fbs, mapping_files, flow_type, ignore_source_name,
                           **kwargs)

    return fbs_mapped, mapping_files
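A minimal usage sketch (the source name and the fedefl_mapping value below are illustrative, and fbs is assumed to be an FBA-format dataframe loaded upstream):

# illustrative datasource parameters as they would appear in a method yaml
v = {'fedefl_mapping': 'USGS_NWIS_WU'}
fbs_mapped, mapping_files = map_fbs_flows(
    fbs, 'USGS_NWIS_WU', v, keep_fba_columns=True)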
Example #2
def harmonize_FBS_columns(df):
    """
    For FBS use in USEEIOR, harmonize the values in the columns
    - LocationSystem: drop the year, so just 'FIPS'
    - MeasureofSpread: tmp set to NoneType as values currently misleading
    - Spread: tmp set to 0 as values currently misleading
    - DistributionType: tmp set to NoneType as values currently misleading
    - MetaSources: Combine strings for rows where
            class/context/flowtype/flowable/etc. are equal
    :param df: FBS dataframe with mixed values/strings in columns
    :return: FBS df with harmonized values/strings in columns
    """

    # harmonize LocationSystem column
    log.info('Drop year in LocationSystem')
    if df['LocationSystem'].str.contains('FIPS').all():
        df = df.assign(LocationSystem='FIPS')
    # harmonize MeasureofSpread
    log.info('Reset MeasureofSpread to NoneType')
    df = df.assign(MeasureofSpread=None)
    # reset spread, as current values are misleading
    log.info('Reset Spread to 0')
    df = df.assign(Spread=0)
    # harmonize Distributiontype
    log.info('Reset DistributionType to NoneType')
    df = df.assign(DistributionType=None)

    # harmonize metasources
    log.info('Harmonize MetaSources')
    df = replace_NoneType_with_empty_cells(df)

    # subset all string cols of the df and drop duplicates
    string_cols = [
        'Flowable', 'Class', 'SectorProducedBy', 'SectorConsumedBy',
        'SectorSourceName', 'Context', 'Location', 'LocationSystem', 'Unit',
        'FlowType', 'Year', 'MeasureofSpread', 'MetaSources'
    ]
    df_sub = df[string_cols].drop_duplicates().reset_index(drop=True)
    # sort df
    df_sub = df_sub.sort_values(
        ['MetaSources', 'SectorProducedBy',
         'SectorConsumedBy']).reset_index(drop=True)

    # new group cols
    group_no_meta = [e for e in string_cols if e != 'MetaSources']

    # combine/sum columns that share the same data other than Metasources,
    # combining MetaSources string in process
    df_sub = df_sub.groupby(group_no_meta)['MetaSources'].apply(
        ', '.join).reset_index()
    # drop the MetaSources col in original df and replace with the
    # MetaSources col in df_sub
    df = df.drop(columns='MetaSources')
    harmonized_df = df.merge(df_sub, how='left')
    harmonized_df = replace_strings_with_NoneType(harmonized_df)

    return harmonized_df
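A usage sketch, assuming an FBS dataframe is already available (the method name is illustrative):

import flowsa

fbs = flowsa.getFlowBySector('Water_national_2015_m1')
fbs_harmonized = harmonize_FBS_columns(fbs)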
Example #3
def writeFlowBySectorBibliography(methodname):
    """
    Generate bibliography for FlowBySectorMethod in local directory
    :param methodname: string, FBS methodname for which to create .bib file
    :return: .bib file saved to local directory
    """
    # Generate a single .bib file for a list of Flow-By-Sector method names
    # and save file to local directory
    log.info('Write bibliography to %s%s.bib', biboutputpath, methodname)
    generate_fbs_bibliography(methodname)
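Usage is a single call; the method name below is illustrative:

writeFlowBySectorBibliography('Water_national_2015_m1')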
Example #4
def reassign_airplane_emissions(df, year, NAICS_level_value):
    """
    Reassigns emissions from airplanes to NAICS associated with air
    transportation instead of the NAICS assigned to airports
    :param df: a dataframe of emissions and mapped facilities from stewicombo
    :param year: year as str
    :param NAICS_level_value: desired NAICS aggregation level,
        using sector_level_key, should match target_sector_level
    :return: df
    """
    import stewi
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches

    # subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    log.info('Reassigning emissions from air transportation from airports')

    # obtain and prepare SCC dataset
    df_airplanes = stewi.getInventory('NEI', year, stewiformat='flowbyprocess')
    df_airplanes = df_airplanes[df_airplanes['Process'] ==
                                air_transportation_SCC]
    df_airplanes['Source'] = 'NEI'
    df_airplanes = addChemicalMatches(df_airplanes)
    df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
    df_airplanes.drop(columns=['Process'], inplace=True)

    facility_mapping_air = df[['FacilityID', 'NAICS']].drop_duplicates(
        keep='first')
    df_airplanes = df_airplanes.merge(facility_mapping_air,
                                      how='left',
                                      on='FacilityID')

    df_airplanes['Year'] = year
    df_airplanes = df_airplanes[df_airplanes['NAICS'].str[0:len(airport_NAICS)]
                                == airport_NAICS]

    # subtract airplane emissions from airport NAICS at individual facilities
    df_planeemissions = df_airplanes[['FacilityID', 'FlowName', 'FlowAmount']]
    df_planeemissions = df_planeemissions.rename(
        columns={'FlowAmount': 'PlaneEmissions'})
    df = df.merge(df_planeemissions, how='left', on=['FacilityID', 'FlowName'])
    df[['PlaneEmissions']] = df[['PlaneEmissions']].fillna(value=0)
    df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
    df.drop(columns=['PlaneEmissions'], inplace=True)

    # add airplane emissions under air transport NAICS
    df_airplanes.loc[:, 'NAICS_lvl'] = \
        air_transportation_naics[0:NAICS_level_value]
    df = pd.concat([df, df_airplanes], ignore_index=True)

    return df
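A hedged usage sketch, assuming df holds stewicombo emissions already merged with facility NAICS (as in stewicombo_to_sector further below):

# year and NAICS level are illustrative; NAICS_level_value=6 keeps the
# full 6-digit air transportation code
df = reassign_airplane_emissions(df, year='2017', NAICS_level_value=6)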
Example #5
def generate_list_of_sources_in_fbs_method(methodname):
    """
    Determine what FlowByActivities are used to generate a FlowBySector
    :param methodname: string, FlowBySector method
    :return: list, pairs of FlowByActivity source names and years
    """
    sources = []
    # load the fbs method yaml
    fbs_yaml = load_yaml_dict(methodname, flowbytype='FBS')

    # create list of data and allocation data sets
    fbs = fbs_yaml['source_names']
    for fbs_k, fbs_v in fbs.items():
        try:
            sources.append([fbs_k, fbs_v['year']])
        except KeyError:
            log.info(
                'Could not append %s to datasource '
                'list because missing year', fbs_k)
            continue
        activities = fbs_v['activity_sets']
        for aset, attr in activities.items():
            if attr['allocation_source'] != 'None':
                sources.append([
                    attr['allocation_source'], attr['allocation_source_year']
                ])
            if 'helper_source' in attr:
                sources.append(
                    [attr['helper_source'], attr['helper_source_year']])
            if 'literature_sources' in attr:
                for source, date in attr['literature_sources'].items():
                    sources.append([source, date])
    # load any additional fbas that are called in a fbs method within fxns
    try:
        fbas = load_fbs_methods_additional_fbas_config()[methodname]
        for s, acts_info in fbas.items():
            for acts, fxn_info in acts_info.items():
                for fxn, fba_info in fxn_info.items():
                    for fba, y in fba_info.items():
                        fxn_config = \
                            load_functions_loading_fbas_config()[fxn][fba]
                        sources.append([fxn_config['source'], y])
    except KeyError:
        # if there are no additional fbas, pass
        log.info(
            'There are no additional Flow-By-Activities '
            'used in generating %s', methodname)
        pass

    return sources
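A usage sketch (the method name is illustrative); each returned element is a [source_name, year] pair:

sources = generate_list_of_sources_in_fbs_method('Water_national_2015_m1')
for source_name, source_year in sources:
    print(source_name, source_year)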
Example #6
def compare_df_units(df1_load, df2_load):
    """
    Determine what units are in each df prior to merge
    :param df1_load: df, first dataframe with a 'Unit' column
    :param df2_load: df, second dataframe with a 'Unit' column
    :return: None; logs a message if the unit lists differ
    """
    df1 = df1_load['Unit'].drop_duplicates().tolist()
    df2 = df2_load['Unit'].drop_duplicates().tolist()

    # identify differences between unit lists
    list_comp = list(set(df1) ^ set(df2))
    # if list is not empty, log that the units differ
    if list_comp:
        log.info('Merging df with %s and df with %s units', df1, df2)
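A self-contained sketch with toy dataframes; the mismatched units trigger the log message:

import pandas as pd

df_water = pd.DataFrame({'Unit': ['gal', 'gal']})
df_land = pd.DataFrame({'Unit': ['ACRES']})
compare_df_units(df_water, df_land)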
Example #7
def usgs_myb_year(years, current_year_str):
    """
    Returns the data column name for the given year, based on a year-range
    string. Checks that the year of interest falls within that range.
    :param years: string, year range separated by a hyphen (e.g. '2014-2018')
    :param current_year_str: string, year of interest
    :return: string, column name in the form 'year_N'
    """
    years_array = years.split("-")
    lower_year = int(years_array[0])
    upper_year = int(years_array[1])
    current_year = int(current_year_str)
    if lower_year <= current_year <= upper_year:
        column_val = current_year - lower_year + 1
        return "year_" + str(column_val)
    else:
        log.info("Your year is out of scope. Pick a year between %s and %s",
                 lower_year, upper_year)
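A worked example: in the range '2014-2018', 2016 is the third year, so the matching data column is 'year_3'; an out-of-range year logs a message and returns None:

assert usgs_myb_year('2014-2018', '2016') == 'year_3'
assert usgs_myb_year('2014-2018', '2020') is None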
Example #8
def function_allocation_method(flow_subset_mapped, k, names, attr, fbs_list):
    """
    Allocate df activities to sectors using a function identified
    in the FBS method yaml
    :param flow_subset_mapped: df, FBA with flows converted using
        fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, fbs dfs created running flowbysector.py
    :return: df, FBS, with activities allocated to sectors
    """
    log.info(
        'Calling on function specified in method yaml to allocate '
        '%s to sectors', ', '.join(map(str, names)))
    fbs = dynamically_import_fxn(k,
                                 attr['allocation_source'])(flow_subset_mapped,
                                                            attr, fbs_list)
    return fbs
Example #9
def direct_allocation_method(fbs, k, names, method):
    """
    Directly assign activities to sectors
    :param fbs: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :return: df with sector columns
    """
    log.info('Directly assigning activities to sectors')
    # for each activity, if activities are not sector like,
    # check that there is no data loss
    if check_activities_sector_like(k) is False:
        activity_list = []
        n_allocated = []
        for n in names:
            # avoid double counting by dropping n from the df after calling on
            # n, in the event both ACB and APB values exist
            fbs = fbs[~(
                (fbs[fba_activity_fields[0]].isin(n_allocated)) |
                (fbs[fba_activity_fields[1]].isin(n_allocated)))].reset_index(
                    drop=True)
            log.debug('Checking for %s at %s', n,
                      method['target_sector_level'])
            fbs_subset = \
                fbs[(fbs[fba_activity_fields[0]] == n) |
                    (fbs[fba_activity_fields[1]] == n)].reset_index(drop=True)
            # check if an Activity maps to more than one sector,
            # if so, equally allocate
            fbs_subset = equal_allocation(fbs_subset)
            fbs_subset = equally_allocate_parent_to_child_naics(
                fbs_subset, method['target_sector_level'])
            activity_list.append(fbs_subset)
            n_allocated.append(n)
        fbs = pd.concat(activity_list, ignore_index=True)
    return fbs
Example #10
def getFlowByActivity(datasource,
                      year,
                      flowclass=None,
                      geographic_level=None,
                      download_FBA_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
                             Optional. E.g. 'national', 'state', 'county'.
    :param download_FBA_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of FBA
    fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work, try to download a remote version of FBA
    if fba is None and download_FBA_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)
    # If that didn't work or wasn't allowed, try to construct the FBA
    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
    # If none of the above worked, log an error message
    if fba is None:
        log.error('getFlowByActivity failed, FBA not found')
    # Otherwise (that is, if one of the above methods successfully loaded
    # the FBA), log it.
    else:
        log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
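A usage sketch (the datasource code and year are illustrative):

import flowsa

fba = flowsa.getFlowByActivity('USDA_CoA_Cropland', 2017,
                               flowclass='Land',
                               download_FBA_if_missing=True)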
Example #11
def getFlowBySector(methodname,
                    download_FBAs_if_missing=DEFAULT_DOWNLOAD_IF_MISSING,
                    download_FBS_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Loads stored FlowBySector output, generating it first if it does
    not yet exist
    :param methodname: string, Name of an available method for the given class
    :param download_FBAs_if_missing: bool, if True will attempt to load FBAs
        used in generating the FBS from remote server prior to generating if
        file not found locally
    :param download_FBS_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: dataframe in flow by sector format
    """
    fbs_meta = set_fb_meta(methodname, "FlowBySector")
    # Try to load a local version of the FBS
    fbs = load_preprocessed_output(fbs_meta, paths)
    # If that didn't work, try to download a remote version of FBS
    if fbs is None and download_FBS_if_missing:
        log.info('%s not found in %s, downloading from remote source',
                 methodname, fbsoutputpath)
        # download and load the FBS parquet
        subdirectory_dict = {'.log': 'Log'}
        download_from_remote(fbs_meta,
                             paths,
                             subdirectory_dict=subdirectory_dict)
        fbs = load_preprocessed_output(fbs_meta, paths)
    # If that didn't work or wasn't allowed, try to construct the FBS
    if fbs is None:
        log.info('%s not found in %s, running functions to generate FBS',
                 methodname, fbsoutputpath)
        # Generate the fbs, with option to download any required FBAs from
        # Data Commons
        flowsa.flowbysector.main(
            method=methodname,
            download_FBAs_if_missing=download_FBAs_if_missing)
        # Now load the fbs
        fbs = load_preprocessed_output(fbs_meta, paths)
    # If none of the above worked, log an error message
    if fbs is None:
        log.error('getFlowBySector failed, FBS not found')
    # Otherwise (that is, if one of the above methods successfully loaded
    # the FBS), log it.
    else:
        log.info('Loaded %s from %s', methodname, fbsoutputpath)
    return fbs
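A usage sketch (the method name is illustrative):

import flowsa

fbs = flowsa.getFlowBySector('Water_national_2015_m1',
                             download_FBAs_if_missing=True,
                             download_FBS_if_missing=True)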
Example #12
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS
    is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS (ex. NAICS_6)
    :return: df with missing sector level data allocated to target level
    """

    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length

        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
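A hedged usage sketch, assuming df is an FBS-format dataframe with string sector codes:

df_allocated = check_for_missing_sector_data(df, 'NAICS_6')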
Example #13
def generate_fbs_bibliography(methodname):
    """
    Generate bibliography for a FlowBySector
    :param methodname: string, methodname for which to create a bibliography
    :return: a .bib file saved in local directory
    """

    from flowsa.metadata import getMetadata

    # create list of sources in method
    sources = generate_list_of_sources_in_fbs_method(methodname)

    # loop through list of sources, load source method
    # yaml, and create bib entry
    bib_list = []
    source_set = set()
    for source in sources:
        # drop list duplicates and any where year is None (because allocation
        # is a function, not a datasource)
        if source[1] != 'None':
            try:
                config = \
                    load_values_from_literature_citations_config()[source[0]]
            except KeyError:
                try:
                    config = getMetadata(source[0], source[1])
                    # flatten the dictionary so can treat all
                    # dictionaries the same when pulling info
                    config = pd.json_normalize(config, sep='_')
                    config.columns = \
                        config.columns.str.replace('tool_meta_', '')
                    config = config.to_dict(orient='records')[0]
                except (KeyError, AttributeError):
                    log.info('Could not find metadata for %s', source[0])
                    continue
            if config is not None:
                # ensure data sources are not duplicated
                # when different source names
                try:
                    if (config['source_name'], config['author'], source[1],
                            config['source_url']) not in source_set:
                        source_set.add(
                            (config['source_name'], config['author'],
                             source[1], config['source_url']))

                        # if there is a date downloaded, use in
                        # citation over date generated
                        if 'original_data_download_date' in config:
                            bib_date = config['original_data_download_date']
                        elif 'date_accessed' in config:
                            bib_date = config['date_accessed']
                        else:
                            bib_date = config['date_created']

                        db = BibDatabase()
                        db.entries = [{
                            'title': f"{config['source_name']} "
                                     f"{str(source[1])}",
                            'author': config['author'],
                            'year': str(source[1]),
                            'url': config['source_url'],
                            'urldate': bib_date,
                            'ID': config['bib_id'] + '_' + str(source[1]),
                            'ENTRYTYPE': 'misc'
                        }]
                        # append each entry to a list of BibDatabase entries
                        bib_list.append(db)
                except KeyError:
                    log.exception(
                        'Missing information needed to '
                        'create bib for %s, %s', source[0], source[1])
                    continue

    # write out bibliography
    writer = BibTexWriter()
    # create directory if missing
    os.makedirs(outputpath + '/Bibliography', exist_ok=True)
    with open(f'{biboutputpath}{methodname}.bib', 'w') as bibfile:
        # loop through all entries in bib_list
        for b in bib_list:
            bibfile.write(writer.write(b))
Example #14
def subset_df_by_geoscale(df, activity_from_scale, activity_to_scale):
    """
    Subset a df by geoscale or agg to create data specified in method yaml
    :param df: df, FBA format
    :param activity_from_scale: str, identified geoscale by which to subset or
                                aggregate from ('national', 'state', 'county')
    :param activity_to_scale: str, identified geoscale by which to subset or
                              aggregate to ('national', 'state', 'county')
    :return: df, FBA, subset or aggregated to a single geoscale for all rows
    """

    # detect grouping cols by columns
    if 'Context' in df.columns:
        groupbycols = fba_mapped_default_grouping_fields
        cols_to_keep = flow_by_activity_mapped_fields
    else:
        groupbycols = fba_default_grouping_fields
        cols_to_keep = flow_by_activity_fields

    # method of subset dependent on LocationSystem
    if df['LocationSystem'].str.contains('FIPS').any():
        df = df[df['LocationSystem'].str.contains('FIPS')].reset_index(
            drop=True)
        # determine 'activity_from_scale' for use in df
        # geoscale subset, by activity
        modified_from_scale = \
            return_activity_from_scale(df, activity_from_scale)
        # add 'activity_from_scale' column to df
        df2 = pd.merge(df, modified_from_scale)

        # list of unique 'from' geoscales
        unique_geoscales = modified_from_scale[
            'activity_from_scale'].drop_duplicates().values.tolist()
        if len(unique_geoscales) > 1:
            log.info('Dataframe has a mix of geographic levels: %s',
                     ', '.join(unique_geoscales))

        # to scale
        if fips_number_key[activity_from_scale] > \
                fips_number_key[activity_to_scale]:
            to_scale = activity_to_scale
        else:
            to_scale = activity_from_scale

        df_subset_list = []
        # subset df based on activity 'from' scale
        for i in unique_geoscales:
            df3 = df2[df2['activity_from_scale'] == i]
            # if desired geoscale doesn't exist, aggregate existing data
            # if df is less aggregated than allocation df, aggregate
            # fba activity to allocation geoscale
            if fips_number_key[i] > fips_number_key[to_scale]:
                log.info("Aggregating subset from %s to %s", i, to_scale)
                df_sub = agg_by_geoscale(df3, i, to_scale, groupbycols)
            # else filter relevant rows
            else:
                log.info("Subsetting %s data", i)
                df_sub = filter_by_geoscale(df3, i)
            df_subset_list.append(df_sub)
        df_subset = pd.concat(df_subset_list, ignore_index=True)

        # drop unused columns
        df_subset = clean_df(df_subset,
                             cols_to_keep,
                             fba_fill_na_dict,
                             drop_description=False)

        return df_subset

    # right now, the only other location system is for Statistics Canada data
    else:
        return df
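A usage sketch, assuming fba is an FBA-format dataframe containing county-level FIPS rows:

# aggregate county-level data up to the state level
df_state = subset_df_by_geoscale(fba, 'county', 'state')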
Example #15
def dataset_allocation_method(flow_subset_mapped, attr, names, method, k, v,
                              aset, aset_names, download_FBA_if_missing):
    """
    Method of allocation using a specified data source
    :param flow_subset_mapped: FBA subset mapped using federal
        elementary flow list
    :param attr: dictionary, attribute data from method yaml for activity set
    :param names: list, activity names in activity set
    :param method: dictionary, FBS method yaml
    :param k: str, the datasource name
    :param v: dictionary, the datasource parameters
    :param aset: dictionary items for FBS method yaml
    :param aset_names: list, activity set names
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons
    :return: df, FBS with activity data allocated to sectors
    """

    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'allocation_flow' in attr:
        fba_dict['flowname_subset'] = attr['allocation_flow']
    if 'allocation_compartment' in attr:
        fba_dict['compartment_subset'] = attr['allocation_compartment']
    if 'clean_allocation_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_allocation_fba']
    if 'clean_allocation_fba_w_sec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_allocation_fba_w_sec']

    # load the allocation FBA
    fba_allocation_wsec = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['allocation_source'],
                           df_year=attr['allocation_source_year'],
                           flowclass=attr['allocation_source_class'],
                           geoscale_from=attr['allocation_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # subset fba datasets to only keep the sectors associated
    # with activity subset
    log.info("Subsetting %s for sectors in %s", attr['allocation_source'], k)
    fba_allocation_subset = \
        get_fba_allocation_subset(fba_allocation_wsec, k, names,
                                  flowSubsetMapped=flow_subset_mapped,
                                  allocMethod=attr['allocation_method'])

    # if there is an allocation helper dataset, modify allocation df
    if 'helper_source' in attr:
        log.info("Using the specified allocation help for subset of %s",
                 attr['allocation_source'])
        fba_allocation_subset = \
            allocation_helper(fba_allocation_subset, attr, method, v,
                              download_FBA_if_missing=download_FBA_if_missing)

    # create flow allocation ratios for each activity
    flow_alloc_list = []
    if 'Context' in fba_allocation_subset.columns:
        group_cols = fba_mapped_wsec_default_grouping_fields
    else:
        group_cols = fba_wsec_default_grouping_fields
    group_cols = [
        e for e in group_cols
        if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
    ]
    n_allocated = []
    for n in names:
        log.debug("Creating allocation ratios for %s", n)
        # if n has already been called, drop all rows of data
        # containing n to avoid double counting when there are two
        # activities in each ACB and APB columns
        fba_allocation_subset = fba_allocation_subset[~(
            (fba_allocation_subset[fba_activity_fields[0]].isin(n_allocated)) |
            (fba_allocation_subset[fba_activity_fields[1]].isin(n_allocated))
        )].reset_index(drop=True)
        fba_allocation_subset_2 = \
            get_fba_allocation_subset(fba_allocation_subset, k, [n],
                                      flowSubsetMapped=flow_subset_mapped,
                                      allocMethod=attr['allocation_method'],
                                      activity_set_names=aset_names)
        if len(fba_allocation_subset_2) == 0:
            log.info("No data found to allocate %s", n)
        else:
            flow_alloc = \
                allocate_by_sector(fba_allocation_subset_2, attr,
                                   attr['allocation_method'], group_cols,
                                   flowSubsetMapped=flow_subset_mapped)
            flow_alloc = flow_alloc.assign(FBA_Activity=n)
            n_allocated.append(n)
            flow_alloc_list.append(flow_alloc)
    flow_allocation = pd.concat(flow_alloc_list, ignore_index=True)

    # generalize activity field names to enable link to main fba source
    log.info("Generalizing activity columns in subset of %s",
             attr['allocation_source'])
    flow_allocation = collapse_activity_fields(flow_allocation)

    # check for issues with allocation ratios
    check_allocation_ratios(flow_allocation, aset, method, attr)

    # create list of sectors in the flow allocation df,
    # drop any rows of data in the flow df that aren't in list
    sector_list = flow_allocation['Sector'].unique().tolist()

    # subset fba allocation table to the values in the activity
    # list, based on overlapping sectors
    flow_subset_mapped = flow_subset_mapped.loc[
        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

    # check if fba and allocation dfs have the same LocationSystem
    log.info("Checking if flowbyactivity and allocation "
             "dataframes use the same location systems")
    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

    # merge fba df w/flow allocation dataset
    log.info("Merge %s and subset of %s", k, attr['allocation_source'])
    for i, j in activity_fields.items():
        # check units
        compare_df_units(flow_subset_mapped, flow_allocation)
        # create list of columns to merge on
        if 'allocation_merge_columns' in attr:
            fa_cols = \
                ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity'] + \
                attr['allocation_merge_columns']
            l_cols = \
                ['Location', j[1]["flowbysector"], j[0]["flowbyactivity"]] + \
                attr['allocation_merge_columns']
            r_cols = ['Location', 'Sector', 'FBA_Activity'] + \
                     attr['allocation_merge_columns']
        else:
            fa_cols = ['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']
            l_cols = ['Location', j[1]["flowbysector"], j[0]["flowbyactivity"]]
            r_cols = ['Location', 'Sector', 'FBA_Activity']
        flow_subset_mapped = \
            flow_subset_mapped.merge(flow_allocation[fa_cols], left_on=l_cols,
                                     right_on=r_cols, how='left')

    # merge the flowamount columns
    flow_subset_mapped.loc[:, 'FlowAmountRatio'] =\
        flow_subset_mapped['FlowAmountRatio_x'].fillna(
            flow_subset_mapped['FlowAmountRatio_y'])
    # fill null rows with 0 because no allocation info
    flow_subset_mapped['FlowAmountRatio'] = \
        flow_subset_mapped['FlowAmountRatio'].fillna(0)

    # drop rows where there is no allocation data
    fbs = flow_subset_mapped.dropna(subset=['Sector_x', 'Sector_y'],
                                    how='all').reset_index()

    # calculate flow amounts for each sector
    log.info("Calculating new flow amounts using flow ratios")
    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

    # drop columns
    log.info("Cleaning up new flow by sector")
    fbs = fbs.drop(columns=[
        'Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
        'FlowAmountRatio', 'FBA_Activity_x', 'FBA_Activity_y'
    ])
    return fbs
Example #16
def reset_fbs_dq_scores(df):
    """
    Set all Data Quality Scores to None
    :param df: FBS dataframe with mixed values/strings in columns
    :return: FBS df with the DQ scores set to null
    """

    # reset spread, as current values are misleading
    log.info('Reset Spread to None')
    df = df.assign(Spread=None)
    # reset min, as current values are misleading
    log.info('Reset Min to None')
    df = df.assign(Min=None)
    # reset max, as current values are misleading
    log.info('Reset Max to None')
    df = df.assign(Max=None)
    # reset DR, as current values are misleading
    log.info('Reset DataReliability to None')
    df = df.assign(DataReliability=None)
    # reset TC, as current values are misleading
    log.info('Reset TemporalCorrelation to None')
    df = df.assign(TemporalCorrelation=None)
    # reset GC, as current values are misleading
    log.info('Reset GeographicalCorrelation to None')
    df = df.assign(GeographicalCorrelation=None)
    # reset TechnologicalCorrelation, as current values are misleading
    log.info('Reset TechnologicalCorrelation to None')
    df = df.assign(TechnologicalCorrelation=None)
    # reset DC, as current values are misleading
    log.info('Reset DataCollection to None')
    df = df.assign(DataCollection=None)

    return df
Example #17
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'allocation_flow',
                   'allocation_compartment','clean_allocation_fba',
                   'clean_allocation_fba_w_sec'
    :return: df, fba format
    """
    # dictionary to load/standardize fba
    kwargs_dict = {}
    if 'download_FBA_if_missing' in kwargs:
        kwargs_dict['download_FBA_if_missing'] = \
            kwargs['download_FBA_if_missing']
    if 'allocation_map_to_flow_list' in attr:
        kwargs_dict['allocation_map_to_flow_list'] = \
            attr['allocation_map_to_flow_list']

    log.info("Loading allocation flowbyactivity %s for year %s",
             fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(datasource=fba_sourcename,
                                        year=df_year,
                                        flowclass=flowclass,
                                        **kwargs_dict)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level",
             geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivty source,
    # if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]
    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = \
                fba.loc[fba['Compartment'].isin(kwargs['compartment_subset'])]

    # cleanup the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(fba_sourcename, kwargs["clean_fba"])(
            fba,
            attr=attr,
            download_FBA_if_missing=kwargs['download_FBA_if_missing'])
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sector to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(
        fba, sectorsourcename=method['target_sector_source'])

    # call on fxn to further clean up/disaggregate the fba
    # allocation data, if exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(
            fba_sourcename, kwargs['clean_fba_w_sec'])(
                fba_wsec,
                attr=attr,
                method=method,
                sourcename=fba_sourcename,
                download_FBA_if_missing=kwargs['download_FBA_if_missing'])

    return fba_wsec
Example #18
def stewicombo_to_sector(yaml_load):
    """
    Returns emissions from stewicombo in fbs format, requires stewi >= 0.9.5
    :param yaml_load: dictionary, which may contain the following elements:
        local_inventory_name: (optional) a string naming the file from which to
                source a pregenerated stewicombo file stored locally (e.g.,
                'CAP_HAP_national_2017_v0.9.7_5cf36c0.parquet' or
                'CAP_HAP_national_2017')
        inventory_dict: a dictionary of inventory types and years (e.g.,
                {'NEI':'2017', 'TRI':'2017'})
        NAICS_level: desired NAICS aggregation level, using sector_level_key,
                should match target_sector_level
        geo_scale: desired geographic aggregation level ('national', 'state',
                'county'), should match target_geoscale
        compartments: list of compartments to include (e.g., 'water', 'air',
                'soil'), use None to include all compartments
        functions: list of functions (str) to call for additional processing
    :return: df, FBS format
    """

    import stewicombo
    from flowsa.data_source_scripts.EPA_NEI import drop_GHGs

    # determine if fxns specified in FBS method yaml
    if 'functions' not in yaml_load:
        functions = []
    else:
        functions = yaml_load['functions']

    if 'local_inventory_name' in yaml_load:
        inventory_name = yaml_load['local_inventory_name']
    else:
        inventory_name = None

    NAICS_level_value = sector_level_key[yaml_load['NAICS_level']]

    df = None
    if inventory_name is not None:
        df = stewicombo.getInventory(inventory_name, True)
    if df is None:
        # run stewicombo to combine inventories, filter for LCI, remove overlap
        log.info('generating inventory in stewicombo')
        df = stewicombo.combineFullInventories(
            yaml_load['inventory_dict'],
            filter_for_LCI=True,
            remove_overlap=True,
            compartments=yaml_load['compartments'])

    if df is None:
        # Inventories not found for stewicombo, return empty FBS
        return None

    df.drop(columns=['SRS_CAS', 'SRS_ID', 'FacilityIDs_Combined'],
            inplace=True)

    inventory_list = list(yaml_load['inventory_dict'].keys())

    if 'drop_GHGs' in functions:
        df = drop_GHGs(df)
        functions.remove('drop_GHGs')
    facility_mapping = extract_facility_data(yaml_load['inventory_dict'])
    # NAICS codes will come from the facility matcher, so drop them here
    facility_mapping.drop(columns=['NAICS'], inplace=True)
    # merge dataframes to assign facility information based on facility IDs
    df = pd.merge(df, facility_mapping, how='left', on='FacilityID')

    all_NAICS = obtain_NAICS_from_facility_matcher(inventory_list)
    df = pd.merge(df, all_NAICS, how='left', on=['FRS_ID', 'Source'])

    # add levelized NAICS code prior to aggregation
    df['NAICS_lvl'] = df['NAICS'].str[0:NAICS_level_value]

    if 'reassign_airplane_emissions' in functions:
        df = reassign_airplane_emissions(df,
                                         yaml_load['inventory_dict']['NEI'],
                                         NAICS_level_value)
        functions.remove('reassign_airplane_emissions')

    df['MetaSources'] = df['Source']

    fbs = prepare_stewi_fbs(df, yaml_load['inventory_dict'],
                            yaml_load['NAICS_level'], yaml_load['geo_scale'])

    for function in functions:
        fbs = getattr(sys.modules[__name__], function)(fbs)

    return fbs
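An illustrative configuration mirroring the keys described in the docstring (the values are examples, not a definitive method yaml):

yaml_load = {
    'inventory_dict': {'NEI': '2017', 'TRI': '2017'},
    'NAICS_level': 'NAICS_6',
    'geo_scale': 'national',
    'compartments': ['air'],
    'functions': [],
}
fbs = stewicombo_to_sector(yaml_load)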
Example #19
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    by activityname (if exists) to target sector level
    output, checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: printout of data differences between loaded FBA and FBS output;
             results saved as csv in local directory
    """
    if check_activities_sector_like(source_name):
        vLog.debug('Not comparing loaded FlowByActivity to FlowBySector '
                   'ratios for a dataset with sector-like activities because '
                   'if there are modifications to flowamounts for a sector, '
                   'then the ratios will be different')
    else:
        # subset fba df
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df

        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge the fba and fbs aggregations
        df_merge = fba_agg.merge(
            fbs_agg,
            on=['ActivityProducedBy', 'ActivityConsumedBy', 'Flowable',
                'Unit', 'FlowType', 'Context', 'Location', 'LocationSystem',
                'Year'],
            how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info(
                'There are %s combinations of flowable/context/sector '
                'length where the flowbyactivity to flowbysector ratio '
                'differs from 1 by more than %s', len(comparison2),
                str(tolerance))

        # include df subset in the validation log; only print rows where
        # the flowamount ratio (rounded to 3 decimals) is less than 1
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info(
            'Save the comparison of FlowByActivity load '
            'to FlowBySector ratios for %s in validation log', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FlowByActivity load to '
                'FlowBySector ratios for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
Example #20
def check_allocation_ratios(flow_alloc_df_load, activity_set, config, attr):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :param attr: dictionary, activity set info
    :return: print out information regarding allocation ratios,
             save csv of results to local directory
    """
    # if in the attr dictionary, merge columns are identified,
    # the merge columns need to be accounted for in the grouping/checking of
    # allocation ratios
    if 'allocation_merge_columns' in attr:
        subset_cols = [
            'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
        ] + attr['allocation_merge_columns']
        groupcols = ['FBA_Activity', 'Location', 'SectorLength'
                     ] + attr['allocation_merge_columns']
    else:
        subset_cols = [
            'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
        ]
        groupcols = ['FBA_Activity', 'Location', 'SectorLength']

    # create column of sector lengths
    flow_alloc_df =\
        flow_alloc_df_load.assign(
            SectorLength=flow_alloc_df_load['Sector'].str.len())
    # subset df
    flow_alloc_df2 = flow_alloc_df[subset_cols]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = \
        flow_alloc_df2.groupby(
            groupcols, dropna=False, as_index=False).agg(
            {"FlowAmountRatio": sum})
    # keep only rows of specified sector length
    flow_alloc_df4 = flow_alloc_df3[
        flow_alloc_df3['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)
    # keep data where the flowamountratio differs from 1 by more
    # than the tolerance
    tolerance = 0.01
    flow_alloc_df5 = flow_alloc_df4[
        (flow_alloc_df4['FlowAmountRatio'] < 1 - tolerance) |
        (flow_alloc_df4['FlowAmountRatio'] > 1 + tolerance)]

    if len(flow_alloc_df5) > 0:
        vLog.info(
            'There are %s instances at a sector length of %s '
            'where the allocation ratio for a location differs '
            'from 1 by more than %s. See Validation Log',
            len(flow_alloc_df5), config["target_sector_level"], str(tolerance))

    # add to validation log
    log.info(
        'Save the summary table of flow allocation ratios for each '
        'sector length for %s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if flow_alloc_df5.empty:
        vLogDetailed.info('Flow allocation ratios for %s '
                          'all round to 1', activity_set)

    else:
        vLogDetailed.info(
            'Flow allocation ratios for %s: '
            '\n {}'.format(flow_alloc_df5.to_string()), activity_set)
Example #21
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments, only argument is
        "method_name", the name of method corresponding to flowbysector
        method yaml name
    :return: parquet, FBS save to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving '
                        'difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data at a subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # add sectors to the activity df, depending on the
                # specified level of sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if the allocation method is "direct", there is no need
                # to create allocation ratios; otherwise use an allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if the allocation method for an activity set requires a
                # specific function due to the complicated nature of the
                # allocation, call that function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v, aset,
                                                    aset_names,
                                                    download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns, dependent on whether
                # the activities are sector-like
                if not check_activities_sector_like(k):
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing NAICS5/6 when only one NAICS5/6 is
                # associated with a NAICS4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length; if so, allocate those values
                # equally to the disaggregated child sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df; necessary because not all of the sectors are
                # NAICS, which can create duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(v["clean_fbs_df_fxn_source"],
                                               v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append directly to the list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data types,
    # and reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of '
        'model results in %s', logoutputpath)
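
The three-way subset above (fbs_1/fbs_2/fbs_3) exists because an FBS row may carry a target-level sector in SectorProducedBy, in SectorConsumedBy, or in both, and a single combined isin() filter on both columns would drop rows where one side is null. A minimal standalone sketch of the same pattern, using hypothetical toy data rather than flowsa output:

import pandas as pd

# hypothetical FBS fragment: either sector column may be null
df = pd.DataFrame({
    'SectorProducedBy': ['111110', None, '212310'],
    'SectorConsumedBy': ['221100', '311111', None],
    'FlowAmount': [10.0, 5.0, 2.0]})
sector_list = ['111110', '221100', '311111', '212310']

# rows where both sector columns are in the target list
both = df[df['SectorProducedBy'].isin(sector_list) &
          df['SectorConsumedBy'].isin(sector_list)]
# rows with only a produced-by sector
produced = df[df['SectorProducedBy'].isin(sector_list) &
              df['SectorConsumedBy'].isnull()]
# rows with only a consumed-by sector
consumed = df[df['SectorProducedBy'].isnull() &
              df['SectorConsumedBy'].isin(sector_list)]
subset = pd.concat([both, produced, consumed], ignore_index=True)
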
Exemple #22
0
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total after
    subsetting by activity and geography with the final flowbysector output
    total. Not a direct comparison of the loaded FBA because FBAs are
    modified before being subset by activity for the target sector level
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml
        for source data
    :param activity_attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :return: printout of data differences between loaded FBA and FBS
        output totals by location; results saved as csv in local directory
    """

    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # determine from scale
    if fips_number_key[source_attr['geoscale_to_use']] < \
            fips_number_key[activity_attr['allocation_from_scale']]:
        from_scale = source_attr['geoscale_to_use']
    else:
        from_scale = activity_attr['allocation_from_scale']

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load, from_scale,
                                method['target_geoscale'])
    if check_activities_sector_like(source_name):
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg = fba_agg.rename(columns={'FlowAmount': 'FBA_amount',
                                      'Unit': 'FBA_unit'})

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg = fbs_agg.rename(columns={'FlowAmount': 'FBS_amount',
                                      'Unit': 'FBS_unit'})

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = \
            df_merge['FBA_amount'] - df_merge['FBS_amount']
        df_merge['Percent_difference'] = \
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info(
            'Comparing FBA %s %s subset to FBS results. '
            'Details in Validation Log', activity_set,
            source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = \
                df_merge[(df_merge['Context'] == i) &
                         (df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for '
                    '%s %s %s at %s is equal to the '
                    'FlowByActivity FlowAmount', source_name, activity_set, i,
                    j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if df not empty, print, if empty, print string
        if df_v.empty:
            vLogDetailed.info('Percent differences for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: \n%s', activity_set,
                df_v.to_string())
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')
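
The comparison above reduces to a simple formula, 100 * (FBA - FBS) / FBA, with adaptive rounding so that tiny differences remain visible in the log. A standalone sketch of that calculation with hypothetical toy totals (not flowsa data):

import numpy as np
import pandas as pd

# hypothetical merged totals for three Context/Location pairs
totals = pd.DataFrame({'FBA_amount': [100.0, 50.0, 8.0],
                       'FBS_amount': [98.0, 50.0, np.nan]})
totals['FlowAmount_difference'] = (totals['FBA_amount'] -
                                   totals['FBS_amount'])
totals['Percent_difference'] = (totals['FlowAmount_difference'] /
                                totals['FBA_amount']) * 100

for diff_per in totals['Percent_difference']:
    if np.isnan(diff_per):
        print('no FBS total to compare against')
    elif abs(diff_per) > 0.01:
        print(f'{round(diff_per, 2)}% difference')  # coarse rounding
    else:
        print(f'{round(diff_per, 6)}% difference')  # keep tiny values visible
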
Exemple #23
0
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to modify fba allocation values using a secondary helper dataset
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
       should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr, fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disaggregation to capture any missing lower-level NAICS
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # merge on whichever sector column is populated; if one sector
    # column is all NoneType, merge on the other
    if all(s is None for s in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(s is None for s in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column on which to base '
                  'the merge with the helper allocation dataset')
        raise ValueError('Ambiguous sector column for helper merge')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation =\
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')
        # load bea codes that sub for naics
        bea = return_bea_codes_used_as_naics()
        # if the sector column to merge on is in the BEA list, carry the
        # sector and HelperFlow values through to prevent dropping data
        modified_fba_allocation['Sector'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation[sector_col_to_merge],
                     modified_fba_allocation['Sector'])
        modified_fba_allocation['HelperFlow'] = \
            np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                     modified_fba_allocation['FlowAmount'],
                     modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values =\
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
                modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] =\
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace any remaining missing helper flow values with 0 so the
        # multiplication does not associate an incorrect value with the
        # new unit
        modified_fba_allocation['HelperFlow'] =\
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation =\
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] =\
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])

    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation =\
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                           modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation =\
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows where FlowAmount is 0
    modified_fba_allocation =\
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # after multiplying by the helper data, rate-based units
    # (gal/employee) represent totals, so rename to 'gal'
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] ==
                                'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)
    return modified_fba_allocation
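
The 'proportional' and 'proportional-flagged' branches both reduce to the same groupby/transform pattern: each row's FlowAmount is scaled by its share of the group's HelperFlow total. A minimal standalone sketch of that pattern with hypothetical toy data, using a simplified Location-only grouping rather than flowsa's full group columns:

import pandas as pd

# hypothetical allocation table: helper flows per row and location
df = pd.DataFrame({
    'Location': ['06', '06', '36'],
    'FlowAmount': [100.0, 100.0, 40.0],
    'HelperFlow': [30.0, 70.0, 40.0]})

# denominator: HelperFlow summed within each group
df['Denominator'] = df.groupby('Location')['HelperFlow'].transform('sum')
# each row's share of the group total
df['FlowAmountRatio'] = df['HelperFlow'] / df['Denominator']
# scale the flow amount by that share
df['FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
# -> the two '06' rows become 30.0 and 70.0; the '36' row stays 40.0
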